1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1996-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_COLLATION
13
14 #include "unicode/unistr.h"
15 #include "unicode/usearch.h"
16
17 #include "cmemory.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
22
23 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
24
25 #include "unicode/testlog.h"
26 #include "unicode/uniset.h"
27 #include "unicode/uset.h"
28 #include "unicode/usetiter.h"
29 #include "unicode/ustring.h"
30 #include "hash.h"
31 #include "normalizer2impl.h"
32 #include "uhash.h"
33 #include "usrchimp.h"
34 #include "uassert.h"
35
36 #include "colldata.h"
37
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))
39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])
41
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)42 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
43 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
44 {
45 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
46 UCollationStrength strength = ucol_getStrength(coll);
47 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
48 uint32_t variableTop = ucol_getVariableTop(coll, &status);
49 uint32_t strengthMask = 0;
50 int32_t order;
51
52 if (U_FAILURE(status)) {
53 return;
54 }
55
56 // **** only set flag if string has Han(gul) ****
57 // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
58
59 switch (strength)
60 {
61 default:
62 strengthMask |= UCOL_TERTIARYORDERMASK;
63 U_FALLTHROUGH;
64 case UCOL_SECONDARY:
65 strengthMask |= UCOL_SECONDARYORDERMASK;
66 U_FALLTHROUGH;
67 case UCOL_PRIMARY:
68 strengthMask |= UCOL_PRIMARYORDERMASK;
69 }
70
71 ces = ceBuffer;
72
73 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
74 UBool cont = isContinuation(order);
75
76 order &= strengthMask;
77
78 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
79 if (strength >= UCOL_QUATERNARY) {
80 order &= UCOL_PRIMARYORDERMASK;
81 } else {
82 order = UCOL_IGNORABLE;
83 }
84 }
85
86 if (order == UCOL_IGNORABLE) {
87 continue;
88 }
89
90 if (cont) {
91 order |= UCOL_CONTINUATION_MARKER;
92 }
93
94 add(order, status);
95 }
96
97 ucol_closeElements(elems);
98 }
99
~CEList()100 CEList::~CEList()
101 {
102 if (ces != ceBuffer) {
103 DELETE_ARRAY(ces);
104 }
105 }
106
add(uint32_t ce,UErrorCode & status)107 void CEList::add(uint32_t ce, UErrorCode &status)
108 {
109 if (U_FAILURE(status)) {
110 return;
111 }
112
113 if (listSize >= listMax) {
114 int32_t newMax = listMax + CELIST_BUFFER_SIZE;
115 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
116
117 if (newCEs == NULL) {
118 status = U_MEMORY_ALLOCATION_ERROR;
119 return;
120 }
121
122 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
123
124 if (ces != ceBuffer) {
125 DELETE_ARRAY(ces);
126 }
127
128 ces = newCEs;
129 listMax = newMax;
130 }
131
132 ces[listSize++] = ce;
133 }
134
get(int32_t index) const135 uint32_t CEList::get(int32_t index) const
136 {
137 if (index >= 0 && index < listSize) {
138 return ces[index];
139 }
140
141 return (uint32_t)UCOL_NULLORDER;
142 }
143
operator [](int32_t index) const144 uint32_t &CEList::operator[](int32_t index) const
145 {
146 return ces[index];
147 }
148
matchesAt(int32_t offset,const CEList * other) const149 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
150 {
151 if (other == NULL || listSize - offset < other->size()) {
152 return FALSE;
153 }
154
155 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
156 if (ces[i] != (*other)[j]) {
157 return FALSE;
158 }
159 }
160
161 return TRUE;
162 }
163
size() const164 int32_t CEList::size() const
165 {
166 return listSize;
167 }
168
StringList(UErrorCode & status)169 StringList::StringList(UErrorCode &status)
170 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
171 {
172 if (U_FAILURE(status)) {
173 return;
174 }
175
176 strings = new UnicodeString [listMax];
177
178 if (strings == NULL) {
179 status = U_MEMORY_ALLOCATION_ERROR;
180 return;
181 }
182 }
183
~StringList()184 StringList::~StringList()
185 {
186 delete[] strings;
187 }
188
add(const UnicodeString * string,UErrorCode & status)189 void StringList::add(const UnicodeString *string, UErrorCode &status)
190 {
191 if (U_FAILURE(status)) {
192 return;
193 }
194 if (listSize >= listMax) {
195 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
196 UnicodeString *newStrings = new UnicodeString[newMax];
197 if (newStrings == NULL) {
198 status = U_MEMORY_ALLOCATION_ERROR;
199 return;
200 }
201 for (int32_t i=0; i<listSize; ++i) {
202 newStrings[i] = strings[i];
203 }
204 delete[] strings;
205 strings = newStrings;
206 listMax = newMax;
207 }
208
209 // The ctor initialized all the strings in
210 // the array to empty strings, so this
211 // is the same as copying the source string.
212 strings[listSize++].append(*string);
213 }
214
add(const UChar * chars,int32_t count,UErrorCode & status)215 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
216 {
217 const UnicodeString string(chars, count);
218
219 add(&string, status);
220 }
221
get(int32_t index) const222 const UnicodeString *StringList::get(int32_t index) const
223 {
224 if (index >= 0 && index < listSize) {
225 return &strings[index];
226 }
227
228 return NULL;
229 }
230
size() const231 int32_t StringList::size() const
232 {
233 return listSize;
234 }
235
236
237 U_CDECL_BEGIN
238 static void U_CALLCONV
deleteStringList(void * obj)239 deleteStringList(void *obj)
240 {
241 StringList *strings = (StringList *) obj;
242
243 delete strings;
244 }
245 U_CDECL_END
246
247 class CEToStringsMap
248 {
249 public:
250 CEToStringsMap(UErrorCode &status);
251 ~CEToStringsMap();
252
253 void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
254 StringList *getStringList(uint32_t ce) const;
255
256 private:
257 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
258 UHashtable *map;
259 };
260
CEToStringsMap(UErrorCode & status)261 CEToStringsMap::CEToStringsMap(UErrorCode &status)
262 : map(NULL)
263 {
264 if (U_FAILURE(status)) {
265 return;
266 }
267
268 map = uhash_open(uhash_hashLong, uhash_compareLong,
269 uhash_compareCaselessUnicodeString,
270 &status);
271
272 if (U_FAILURE(status)) {
273 return;
274 }
275
276 uhash_setValueDeleter(map, deleteStringList);
277 }
278
~CEToStringsMap()279 CEToStringsMap::~CEToStringsMap()
280 {
281 uhash_close(map);
282 }
283
put(uint32_t ce,UnicodeString * string,UErrorCode & status)284 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
285 {
286 StringList *strings = getStringList(ce);
287
288 if (strings == NULL) {
289 strings = new StringList(status);
290
291 if (strings == NULL || U_FAILURE(status)) {
292 status = U_MEMORY_ALLOCATION_ERROR;
293 return;
294 }
295
296 putStringList(ce, strings, status);
297 }
298
299 strings->add(string, status);
300 }
301
getStringList(uint32_t ce) const302 StringList *CEToStringsMap::getStringList(uint32_t ce) const
303 {
304 return (StringList *) uhash_iget(map, ce);
305 }
306
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)307 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
308 {
309 uhash_iput(map, ce, (void *) stringList, &status);
310 }
311
312 #define CLONE_COLLATOR
313
CollData(UCollator * collator,UErrorCode & status)314 CollData::CollData(UCollator *collator, UErrorCode &status)
315 : coll(NULL), ceToCharsStartingWith(NULL)
316 {
317 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
318 // i.e. other, control, private use, format, surrogate
319 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
320 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
321 USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
322
323 // Han ext. A, Han, Jamo, Hangul, Han Ext. B
324 // i.e. all the characers we handle implicitly
325 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
326 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
327 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
328
329 if (U_FAILURE(status)) {
330 return;
331 }
332
333 USet *expansions = uset_openEmpty();
334 USet *contractions = uset_openEmpty();
335 int32_t itemCount;
336
337 ceToCharsStartingWith = new CEToStringsMap(status);
338
339 if (U_FAILURE(status)) {
340 goto bail;
341 }
342
343 #ifdef CLONE_COLLATOR
344 coll = ucol_safeClone(collator, NULL, NULL, &status);
345
346 if (U_FAILURE(status)) {
347 goto bail;
348 }
349 #else
350 coll = collator;
351 #endif
352
353 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
354
355 uset_addAll(charsToTest, contractions);
356 uset_addAll(charsToTest, expansions);
357 uset_removeAll(charsToTest, charsToRemove);
358
359 itemCount = uset_getItemCount(charsToTest);
360 for(int32_t item = 0; item < itemCount; item += 1) {
361 UChar32 start = 0, end = 0;
362 UChar buffer[16];
363 int32_t len = uset_getItem(charsToTest, item, &start, &end,
364 buffer, 16, &status);
365
366 if (len == 0) {
367 for (UChar32 ch = start; ch <= end; ch += 1) {
368 UnicodeString *st = new UnicodeString(ch);
369
370 if (st == NULL) {
371 status = U_MEMORY_ALLOCATION_ERROR;
372 break;
373 }
374
375 CEList *ceList = new CEList(coll, *st, status);
376
377 ceToCharsStartingWith->put(ceList->get(0), st, status);
378
379 delete ceList;
380 delete st;
381 }
382 } else if (len > 0) {
383 UnicodeString *st = new UnicodeString(buffer, len);
384
385 if (st == NULL) {
386 status = U_MEMORY_ALLOCATION_ERROR;
387 break;
388 }
389
390 CEList *ceList = new CEList(coll, *st, status);
391
392 ceToCharsStartingWith->put(ceList->get(0), st, status);
393
394 delete ceList;
395 delete st;
396 } else {
397 // shouldn't happen...
398 }
399
400 if (U_FAILURE(status)) {
401 break;
402 }
403 }
404
405 bail:
406 uset_close(contractions);
407 uset_close(expansions);
408 uset_close(charsToRemove);
409 uset_close(charsToTest);
410
411 if (U_FAILURE(status)) {
412 return;
413 }
414
415 UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
416 if (U_FAILURE(status)) {
417 return;
418 }
419 UnicodeSetIterator hanIter(hanRanges);
420 UnicodeString hanString;
421 while(hanIter.nextRange()) {
422 hanString.append(hanIter.getCodepoint());
423 hanString.append(hanIter.getCodepointEnd());
424 }
425 // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
426 // but as of Unicode 6.3 the 11xx block is filled,
427 // and there are also more Jamo T at U+D7CB..U+D7FB.
428 // Maybe use [:HST=T:] and look for the end of the last range?
429 // Maybe use script boundary mappings instead of this code??
430 UChar jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
431 UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
432 CEList hanList(coll, hanString, status);
433 CEList jamoList(coll, jamoString, status);
434 int32_t j = 0;
435
436 if (U_FAILURE(status)) {
437 return;
438 }
439
440 for (int32_t c = 0; c < jamoList.size(); c += 1) {
441 uint32_t jce = jamoList[c];
442
443 if (! isContinuation(jce)) {
444 jamoLimits[j++] = jce;
445 }
446 }
447
448 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
449
450 minHan = 0xFFFFFFFF;
451 maxHan = 0;
452
453 for(int32_t h = 0; h < hanList.size(); h += 2) {
454 uint32_t han = (uint32_t) hanList[h];
455
456 if (han < minHan) {
457 minHan = han;
458 }
459
460 if (han > maxHan) {
461 maxHan = han;
462 }
463 }
464
465 maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
466 }
467
~CollData()468 CollData::~CollData()
469 {
470 #ifdef CLONE_COLLATOR
471 ucol_close(coll);
472 #endif
473
474 delete ceToCharsStartingWith;
475 }
476
getCollator() const477 UCollator *CollData::getCollator() const
478 {
479 return coll;
480 }
481
getStringList(int32_t ce) const482 const StringList *CollData::getStringList(int32_t ce) const
483 {
484 return ceToCharsStartingWith->getStringList(ce);
485 }
486
getCEList(const UnicodeString * string) const487 const CEList *CollData::getCEList(const UnicodeString *string) const
488 {
489 UErrorCode status = U_ZERO_ERROR;
490 const CEList *list = new CEList(coll, *string, status);
491
492 if (U_FAILURE(status)) {
493 delete list;
494 list = NULL;
495 }
496
497 return list;
498 }
499
freeCEList(const CEList * list)500 void CollData::freeCEList(const CEList *list)
501 {
502 delete list;
503 }
504
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const505 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
506 {
507 // find out shortest string for the longest sequence of ces.
508 // this can probably be folded with the minLengthCache...
509
510 if (history[offset] >= 0) {
511 return history[offset];
512 }
513
514 uint32_t ce = ceList->get(offset);
515 int32_t maxOffset = ceList->size();
516 int32_t shortestLength = INT32_MAX;
517 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
518
519 if (strings != NULL) {
520 int32_t stringCount = strings->size();
521
522 for (int32_t s = 0; s < stringCount; s += 1) {
523 const UnicodeString *string = strings->get(s);
524 UErrorCode status = U_ZERO_ERROR;
525 const CEList *ceList2 = new CEList(coll, *string, status);
526
527 if (U_FAILURE(status)) {
528 delete ceList2;
529 ceList2 = NULL;
530 }
531
532 if (ceList->matchesAt(offset, ceList2)) {
533 U_ASSERT(ceList2 != NULL);
534 int32_t clength = ceList2->size();
535 int32_t slength = string->length();
536 int32_t roffset = offset + clength;
537 int32_t rlength = 0;
538
539 if (roffset < maxOffset) {
540 rlength = minLengthInChars(ceList, roffset, history);
541
542 if (rlength <= 0) {
543 // delete before continue to avoid memory leak.
544 delete ceList2;
545
546 // ignore any dead ends
547 continue;
548 }
549 }
550
551 if (shortestLength > slength + rlength) {
552 shortestLength = slength + rlength;
553 }
554 }
555
556 delete ceList2;
557 }
558 }
559
560 if (shortestLength == INT32_MAX) {
561 // No matching strings at this offset. See if
562 // the CE is in a range we can handle manually.
563 if (ce >= minHan && ce < maxHan) {
564 // all han have implicit orders which
565 // generate two CEs.
566 int32_t roffset = offset + 2;
567 int32_t rlength = 0;
568
569 //history[roffset++] = -1;
570 //history[roffset++] = 1;
571
572 if (roffset < maxOffset) {
573 rlength = minLengthInChars(ceList, roffset, history);
574 }
575
576 if (rlength < 0) {
577 return -1;
578 }
579
580 shortestLength = 1 + rlength;
581 goto have_shortest;
582 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
583 int32_t roffset = offset;
584 int32_t rlength = 0;
585
586 // **** this loop may not handle archaic Hangul correctly ****
587 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
588 uint32_t jce = ceList->get(roffset);
589
590 // Some Jamo have 24-bit primary order; skip the
591 // 2nd CE. This should always be OK because if
592 // we're still in the loop all we've seen are
593 // a series of Jamo in LVT order.
594 if (isContinuation(jce)) {
595 continue;
596 }
597
598 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
599 break;
600 }
601 }
602
603 if (roffset == offset) {
604 // we started with a non-L Jamo...
605 // just say it comes from a single character
606 roffset += 1;
607
608 // See if the single Jamo has a 24-bit order.
609 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
610 roffset += 1;
611 }
612 }
613
614 if (roffset < maxOffset) {
615 rlength = minLengthInChars(ceList, roffset, history);
616 }
617
618 if (rlength < 0) {
619 return -1;
620 }
621
622 shortestLength = 1 + rlength;
623 goto have_shortest;
624 }
625
626 // Can't handle it manually either. Just move on.
627 return -1;
628 }
629
630 have_shortest:
631 history[offset] = shortestLength;
632
633 return shortestLength;
634 }
635
minLengthInChars(const CEList * ceList,int32_t offset) const636 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
637 {
638 int32_t clength = ceList->size();
639 int32_t *history = NEW_ARRAY(int32_t, clength);
640
641 for (int32_t i = 0; i < clength; i += 1) {
642 history[i] = -1;
643 }
644
645 int32_t minLength = minLengthInChars(ceList, offset, history);
646
647 DELETE_ARRAY(history);
648
649 return minLength;
650 }
651
652 #endif // #if !UCONFIG_NO_COLLATION
653