1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_COLLATION
12
13 #include "unicode/coll.h"
14 #include "unicode/tblcoll.h"
15 #include "unicode/unistr.h"
16 #include "unicode/sortkey.h"
17 #include "itercoll.h"
18 #include "unicode/schriter.h"
19 #include "unicode/chariter.h"
20 #include "unicode/uchar.h"
21 #include "cmemory.h"
22
23 static UErrorCode status = U_ZERO_ERROR;
24
CollationIteratorTest()25 CollationIteratorTest::CollationIteratorTest()
26 : test1("What subset of all possible test cases?", ""),
27 test2("has the highest probability of detecting", "")
28 {
29 en_us = dynamic_cast<RuleBasedCollator*>(Collator::createInstance(Locale::getUS(), status));
30 if(U_FAILURE(status)) {
31 delete en_us;
32 en_us = nullptr;
33 errcheckln(status, "Collator creation failed with %s", u_errorName(status));
34 return;
35 }
36
37 }
38
~CollationIteratorTest()39 CollationIteratorTest::~CollationIteratorTest()
40 {
41 delete en_us;
42 }
43
44 /**
45 * Test for CollationElementIterator previous and next for the whole set of
46 * unicode characters.
47 */
TestUnicodeChar()48 void CollationIteratorTest::TestUnicodeChar()
49 {
50 CollationElementIterator *iter;
51 char16_t codepoint;
52 UnicodeString source;
53
54 for (codepoint = 1; codepoint < 0xFFFE;)
55 {
56 source.remove();
57
58 while (codepoint % 0xFF != 0)
59 {
60 if (u_isdefined(codepoint))
61 source += codepoint;
62 codepoint ++;
63 }
64
65 if (u_isdefined(codepoint))
66 source += codepoint;
67
68 if (codepoint != 0xFFFF)
69 codepoint ++;
70
71 iter = en_us->createCollationElementIterator(source);
72 /* A basic test to see if it's working at all */
73 backAndForth(*iter);
74 delete iter;
75 }
76 }
77
78 /**
79 * Test for CollationElementIterator.previous()
80 *
81 * @bug 4108758 - Make sure it works with contracting characters
82 *
83 */
TestPrevious()84 void CollationIteratorTest::TestPrevious(/* char* par */)
85 {
86 UErrorCode status = U_ZERO_ERROR;
87 CollationElementIterator *iter = en_us->createCollationElementIterator(test1);
88
89 // A basic test to see if it's working at all
90 backAndForth(*iter);
91 delete iter;
92
93 // Test with a contracting character sequence
94 UnicodeString source;
95 RuleBasedCollator *c1 = nullptr;
96 c1 = new RuleBasedCollator(
97 UnicodeString("&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH"), status);
98
99 if (c1 == nullptr || U_FAILURE(status))
100 {
101 errln("Couldn't create a RuleBasedCollator with a contracting sequence.");
102 delete c1;
103 return;
104 }
105
106 source = "abchdcba";
107 iter = c1->createCollationElementIterator(source);
108 backAndForth(*iter);
109 delete iter;
110 delete c1;
111
112 // Test with an expanding character sequence
113 RuleBasedCollator *c2 = nullptr;
114 c2 = new RuleBasedCollator(UnicodeString("&a < b < c/abd < d"), status);
115
116 if (c2 == nullptr || U_FAILURE(status))
117 {
118 errln("Couldn't create a RuleBasedCollator with an expanding sequence.");
119 delete c2;
120 return;
121 }
122
123 source = "abcd";
124 iter = c2->createCollationElementIterator(source);
125 backAndForth(*iter);
126 delete iter;
127 delete c2;
128
129 // Now try both
130 RuleBasedCollator *c3 = nullptr;
131 c3 = new RuleBasedCollator(UnicodeString("&a < b < c/aba < d < z < ch"), status);
132
133 if (c3 == nullptr || U_FAILURE(status))
134 {
135 errln("Couldn't create a RuleBasedCollator with both an expanding and a contracting sequence.");
136 delete c3;
137 return;
138 }
139
140 source = "abcdbchdc";
141 iter = c3->createCollationElementIterator(source);
142 backAndForth(*iter);
143 delete iter;
144 delete c3;
145
146 status=U_ZERO_ERROR;
147 source= CharsToUnicodeString("\\u0e41\\u0e02\\u0e41\\u0e02\\u0e27abc");
148
149 Collator *c4 = Collator::createInstance(Locale("th", "TH", ""), status);
150 if(U_FAILURE(status)){
151 errln("Couldn't create a collator");
152 }
153 iter = (dynamic_cast<RuleBasedCollator*>(c4))->createCollationElementIterator(source);
154 backAndForth(*iter);
155 delete iter;
156 delete c4;
157
158 source= CharsToUnicodeString("\\u0061\\u30CF\\u3099\\u30FC");
159 Collator *c5 = Collator::createInstance(Locale("ja", "JP", ""), status);
160
161 iter = (dynamic_cast<RuleBasedCollator*>(c5))->createCollationElementIterator(source);
162 if(U_FAILURE(status)){
163 errln("Couldn't create Japanese collator\n");
164 }
165 backAndForth(*iter);
166 delete iter;
167 delete c5;
168 }
169
170 /**
171 * Test for getOffset() and setOffset()
172 */
TestOffset()173 void CollationIteratorTest::TestOffset(/* char* par */)
174 {
175 CollationElementIterator *iter = en_us->createCollationElementIterator(test1);
176 UErrorCode status = U_ZERO_ERROR;
177 // testing boundaries
178 iter->setOffset(0, status);
179 if (U_FAILURE(status) || iter->previous(status) != CollationElementIterator::NULLORDER) {
180 errln("Error: After setting offset to 0, we should be at the end "
181 "of the backwards iteration");
182 }
183 iter->setOffset(test1.length(), status);
184 if (U_FAILURE(status) || iter->next(status) != CollationElementIterator::NULLORDER) {
185 errln("Error: After setting offset to end of the string, we should "
186 "be at the end of the backwards iteration");
187 }
188
189 // Run all the way through the iterator, then get the offset
190 int32_t orderLength = 0;
191 Order *orders = getOrders(*iter, orderLength);
192
193 int32_t offset = iter->getOffset();
194
195 if (offset != test1.length())
196 {
197 UnicodeString msg1("offset at end != length: ");
198 UnicodeString msg2(" vs ");
199
200 errln(msg1 + offset + msg2 + test1.length());
201 }
202
203 // Now set the offset back to the beginning and see if it works
204 CollationElementIterator *pristine = en_us->createCollationElementIterator(test1);
205
206 iter->setOffset(0, status);
207
208 if (U_FAILURE(status))
209 {
210 errln("setOffset failed.");
211 }
212 else
213 {
214 assertEqual(*iter, *pristine);
215 }
216
217 delete pristine;
218 delete[] orders;
219 delete iter;
220
221 // setting offset in the middle of a contraction
222 UnicodeString contraction = "change";
223 status = U_ZERO_ERROR;
224 RuleBasedCollator tailored("& a < ch", status);
225 if (U_FAILURE(status)) {
226 errln("Error: in creation of Spanish collator - %s", u_errorName(status));
227 return;
228 }
229 iter = tailored.createCollationElementIterator(contraction);
230 Order *order = getOrders(*iter, orderLength);
231 iter->setOffset(1, status); // sets offset in the middle of ch
232 int32_t order2Length = 0;
233 Order *order2 = getOrders(*iter, order2Length);
234 if (orderLength != order2Length || uprv_memcmp(order, order2, orderLength * sizeof(Order)) != 0) {
235 errln("Error: setting offset in the middle of a contraction should be the same as setting it to the start of the contraction");
236 }
237 delete[] order;
238 delete[] order2;
239 delete iter;
240 contraction = "peache";
241 iter = tailored.createCollationElementIterator(contraction);
242 iter->setOffset(3, status);
243 order = getOrders(*iter, orderLength);
244 iter->setOffset(4, status); // sets offset in the middle of ch
245 order2 = getOrders(*iter, order2Length);
246 if (orderLength != order2Length || uprv_memcmp(order, order2, orderLength * sizeof(Order)) != 0) {
247 errln("Error: setting offset in the middle of a contraction should be the same as setting it to the start of the contraction");
248 }
249 delete[] order;
250 delete[] order2;
251 delete iter;
252 // setting offset in the middle of a surrogate pair
253 UnicodeString surrogate = UNICODE_STRING_SIMPLE("\\ud800\\udc00str").unescape();
254 iter = tailored.createCollationElementIterator(surrogate);
255 order = getOrders(*iter, orderLength);
256 iter->setOffset(1, status); // sets offset in the middle of surrogate
257 order2 = getOrders(*iter, order2Length);
258 if (orderLength != order2Length || uprv_memcmp(order, order2, orderLength * sizeof(Order)) != 0) {
259 errln("Error: setting offset in the middle of a surrogate pair should be the same as setting it to the start of the surrogate pair");
260 }
261 delete[] order;
262 delete[] order2;
263 delete iter;
264 surrogate = UNICODE_STRING_SIMPLE("simple\\ud800\\udc00str").unescape();
265 iter = tailored.createCollationElementIterator(surrogate);
266 iter->setOffset(6, status);
267 order = getOrders(*iter, orderLength);
268 iter->setOffset(7, status); // sets offset in the middle of surrogate
269 order2 = getOrders(*iter, order2Length);
270 if (orderLength != order2Length || uprv_memcmp(order, order2, orderLength * sizeof(Order)) != 0) {
271 errln("Error: setting offset in the middle of a surrogate pair should be the same as setting it to the start of the surrogate pair");
272 }
273 delete[] order;
274 delete[] order2;
275 delete iter;
276 // TODO: try iterating halfway through a messy string.
277 }
278
279 /**
280 * Test for setText()
281 */
TestSetText()282 void CollationIteratorTest::TestSetText(/* char* par */)
283 {
284 CollationElementIterator *iter1 = en_us->createCollationElementIterator(test1);
285 CollationElementIterator *iter2 = en_us->createCollationElementIterator(test2);
286 UErrorCode status = U_ZERO_ERROR;
287
288 // Run through the second iterator just to exercise it
289 int32_t c = iter2->next(status);
290 int32_t i = 0;
291
292 while ( ++i < 10 && c != CollationElementIterator::NULLORDER)
293 {
294 if (U_FAILURE(status))
295 {
296 errln("iter2->next() returned an error.");
297 delete iter2;
298 delete iter1;
299 }
300
301 c = iter2->next(status);
302 }
303
304 // Now set it to point to the same string as the first iterator
305 iter2->setText(test1, status);
306
307 if (U_FAILURE(status))
308 {
309 errln("call to iter2->setText(test1) failed.");
310 }
311 else
312 {
313 assertEqual(*iter1, *iter2);
314 }
315 iter1->reset();
316 //now use the overloaded setText(CharacterIterator&, UErrorCode) function to set the text
317 CharacterIterator* chariter = new StringCharacterIterator(test1);
318 iter2->setText(*chariter, status);
319 if (U_FAILURE(status))
320 {
321 errln("call to iter2->setText(chariter(test1)) failed.");
322 }
323 else
324 {
325 assertEqual(*iter1, *iter2);
326 }
327
328 // test for an empty string
329 UnicodeString empty("");
330 iter1->setText(empty, status);
331 if (U_FAILURE(status)
332 || iter1->next(status) != static_cast<int32_t>(CollationElementIterator::NULLORDER)) {
333 errln("Empty string should have no CEs.");
334 }
335 (dynamic_cast<StringCharacterIterator*>(chariter))->setText(empty);
336 iter1->setText(*chariter, status);
337 if (U_FAILURE(status)
338 || iter1->next(status) != static_cast<int32_t>(CollationElementIterator::NULLORDER)) {
339 errln("Empty string should have no CEs.");
340 }
341 delete chariter;
342 delete iter2;
343 delete iter1;
344 }
345
346 /** @bug 4108762
347 * Test for getMaxExpansion()
348 */
TestMaxExpansion()349 void CollationIteratorTest::TestMaxExpansion(/* char* par */)
350 {
351 UErrorCode status = U_ZERO_ERROR;
352 UnicodeString rule("&a < ab < c/aba < d < z < ch");
353 RuleBasedCollator *coll = new RuleBasedCollator(rule, status);
354 char16_t ch = 0;
355 UnicodeString str(ch);
356
357 CollationElementIterator *iter = coll->createCollationElementIterator(str);
358
359 while (ch < 0xFFFF && U_SUCCESS(status)) {
360 int count = 1;
361 uint32_t order;
362 ch ++;
363 UnicodeString str(ch);
364 iter->setText(str, status);
365 order = iter->previous(status);
366
367 /* thai management */
368 if (CollationElementIterator::isIgnorable(order))
369 order = iter->previous(status);
370
371 while (U_SUCCESS(status)
372 && iter->previous(status) != static_cast<int32_t>(CollationElementIterator::NULLORDER))
373 {
374 count ++;
375 }
376
377 if (U_FAILURE(status) && iter->getMaxExpansion(order) < count) {
378 errln("Failure at codepoint %d, maximum expansion count < %d\n",
379 ch, count);
380 }
381 }
382
383 delete iter;
384 delete coll;
385 }
386
387 /*
388 * @bug 4157299
389 */
TestClearBuffers()390 void CollationIteratorTest::TestClearBuffers(/* char* par */)
391 {
392 UErrorCode status = U_ZERO_ERROR;
393 RuleBasedCollator* c = new RuleBasedCollator(UnicodeString("&a < b < c & ab = d"), status);
394
395 if (c == nullptr || U_FAILURE(status))
396 {
397 errln("Couldn't create a RuleBasedCollator.");
398 delete c;
399 return;
400 }
401
402 UnicodeString source("abcd");
403 CollationElementIterator *i = c->createCollationElementIterator(source);
404 int32_t e0 = i->next(status); // save the first collation element
405
406 if (U_FAILURE(status))
407 {
408 errln("call to i->next() failed. err=%s", u_errorName(status));
409 }
410 else
411 {
412 i->setOffset(3, status); // go to the expanding character
413
414 if (U_FAILURE(status))
415 {
416 errln("call to i->setOffset(3) failed. err=%s", u_errorName(status));
417 }
418 else
419 {
420 i->next(status); // but only use up half of it
421
422 if (U_FAILURE(status))
423 {
424 errln("call to i->next() failed. err=%s", u_errorName(status));
425 }
426 else
427 {
428 i->setOffset(0, status); // go back to the beginning
429
430 if (U_FAILURE(status))
431 {
432 errln("call to i->setOffset(0) failed. err=%s", u_errorName(status));
433 }
434 else
435 {
436 int32_t e = i->next(status); // and get this one again
437
438 if (U_FAILURE(status))
439 {
440 errln("call to i->next() failed. err=%s", u_errorName(status));
441 }
442 else if (e != e0)
443 {
444 errln("got 0x%X, expected 0x%X", e, e0);
445 }
446 }
447 }
448 }
449 }
450
451 delete i;
452 delete c;
453 }
454
455 /**
456 * Testing the assignment operator
457 */
TestAssignment()458 void CollationIteratorTest::TestAssignment()
459 {
460 UErrorCode status = U_ZERO_ERROR;
461 RuleBasedCollator *coll =
462 dynamic_cast<RuleBasedCollator*>(Collator::createInstance(status));
463
464 if (coll == nullptr || U_FAILURE(status))
465 {
466 errln("Couldn't create a default collator.");
467 return;
468 }
469
470 UnicodeString source("abcd");
471 CollationElementIterator *iter1 =
472 coll->createCollationElementIterator(source);
473
474 CollationElementIterator iter2 = *iter1;
475
476 if (*iter1 != iter2) {
477 errln("Fail collation iterator assignment does not produce the same elements");
478 }
479
480 CollationElementIterator iter3(*iter1);
481
482 if (*iter1 != iter3) {
483 errln("Fail collation iterator copy constructor does not produce the same elements");
484 }
485
486 source = CharsToUnicodeString("a\\u0300\\u0325");
487 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
488 CollationElementIterator *iter4
489 = coll->createCollationElementIterator(source);
490 CollationElementIterator iter5(*iter4);
491 int32_t order4, order5;
492 if (*iter4 != iter5) {
493 errln("collation iterator assignment does not produce the same elements");
494 }
495 order4 = iter4->next(status);
496 if (U_FAILURE(status) || *iter4 == iter5) {
497 errln("collation iterator not equal");
498 }
499 order5 = iter5.next(status);
500 if (U_FAILURE(status) || *iter4 != iter5) {
501 errln("collation iterator equal");
502 }
503 order4 = iter4->next(status);
504 if (U_FAILURE(status) || *iter4 == iter5) {
505 errln("collation iterator not equal");
506 }
507 order5 = iter5.next(status);
508 if (U_FAILURE(status) || *iter4 != iter5) {
509 errln("collation iterator equal");
510 }
511 CollationElementIterator iter6(*iter4);
512 if (*iter4 != iter6) {
513 errln("collation iterator equal");
514 }
515 order4 = iter4->next(status);
516 if (U_FAILURE(status) || *iter4 == iter5) {
517 errln("collation iterator not equal");
518 }
519 order5 = iter5.next(status);
520 if (U_FAILURE(status) || *iter4 != iter5) {
521 errln("collation iterator equal");
522 }
523 if (!(order4 == CollationElementIterator::NULLORDER &&
524 order5 == CollationElementIterator::NULLORDER)) {
525 order4 = iter4->next(status);
526 if (U_FAILURE(status) || *iter4 == iter5) {
527 errln("collation iterator not equal");
528 }
529 order5 = iter5.next(status);
530 if (U_FAILURE(status) || *iter4 != iter5) {
531 errln("collation iterator equal");
532 }
533 }
534 delete iter1;
535 delete iter4;
536 delete coll;
537 }
538
539 /**
540 * Testing the constructors
541 */
TestConstructors()542 void CollationIteratorTest::TestConstructors()
543 {
544 UErrorCode status = U_ZERO_ERROR;
545 RuleBasedCollator *coll =
546 dynamic_cast<RuleBasedCollator*>(Collator::createInstance(status));
547 if (coll == nullptr || U_FAILURE(status))
548 {
549 errln("Couldn't create a default collator.");
550 return;
551 }
552
553 // testing protected constructor with character iterator as argument
554 StringCharacterIterator chariter(test1);
555 CollationElementIterator *iter1 =
556 coll->createCollationElementIterator(chariter);
557 if (U_FAILURE(status)) {
558 errln("Couldn't create collation element iterator with character iterator.");
559 return;
560 }
561 CollationElementIterator *iter2 =
562 coll->createCollationElementIterator(test1);
563
564 // initially the 2 collation element iterators should be the same
565 if (*iter1 != *iter1 || *iter2 != *iter2 || *iter1 != *iter2
566 || *iter2 != *iter1) {
567 errln("CollationElementIterators constructed with the same string data should be the same at the start");
568 }
569 assertEqual(*iter1, *iter2);
570
571 delete iter1;
572 delete iter2;
573
574 // tests empty strings
575 UnicodeString empty("");
576 iter1 = coll->createCollationElementIterator(empty);
577 chariter.setText(empty);
578 iter2 = coll->createCollationElementIterator(chariter);
579 if (*iter1 != *iter1 || *iter2 != *iter2 || *iter1 != *iter2
580 || *iter2 != *iter1) {
581 errln("CollationElementIterators constructed with the same string data should be the same at the start");
582 }
583 if (iter1->next(status) != static_cast<int32_t>(CollationElementIterator::NULLORDER)) {
584 errln("Empty string should have no CEs.");
585 }
586 if (iter2->next(status) != static_cast<int32_t>(CollationElementIterator::NULLORDER)) {
587 errln("Empty string should have no CEs.");
588 }
589 delete iter1;
590 delete iter2;
591 delete coll;
592 }
593
594 /**
595 * Testing the strength order
596 */
TestStrengthOrder()597 void CollationIteratorTest::TestStrengthOrder()
598 {
599 int order = 0x0123ABCD;
600
601 UErrorCode status = U_ZERO_ERROR;
602 RuleBasedCollator *coll =
603 dynamic_cast<RuleBasedCollator*>(Collator::createInstance(status));
604 if (coll == nullptr || U_FAILURE(status))
605 {
606 errln("Couldn't create a default collator.");
607 return;
608 }
609
610 coll->setStrength(Collator::PRIMARY);
611 CollationElementIterator *iter =
612 coll->createCollationElementIterator(test1);
613
614 if (iter == nullptr) {
615 errln("Couldn't create a collation element iterator from default collator");
616 return;
617 }
618
619 if (iter->strengthOrder(order) != 0x01230000) {
620 errln("Strength order for a primary strength collator should be the first 2 bytes");
621 return;
622 }
623
624 coll->setStrength(Collator::SECONDARY);
625 if (iter->strengthOrder(order) != 0x0123AB00) {
626 errln("Strength order for a secondary strength collator should be the third byte");
627 return;
628 }
629
630 coll->setStrength(Collator::TERTIARY);
631 if (iter->strengthOrder(order) != order) {
632 errln("Strength order for a tertiary strength collator should be the third byte");
633 return;
634 }
635 delete iter;
636 delete coll;
637 }
638
639 /**
640 * Return a string containing all of the collation orders
641 * returned by calls to next on the specified iterator
642 */
orderString(CollationElementIterator & iter,UnicodeString & target)643 UnicodeString &CollationIteratorTest::orderString(CollationElementIterator &iter, UnicodeString &target)
644 {
645 int32_t order;
646 UErrorCode status = U_ZERO_ERROR;
647
648 while ((order = iter.next(status)) != CollationElementIterator::NULLORDER)
649 {
650 target += "0x";
651 appendHex(order, 8, target);
652 target += " ";
653 }
654
655 return target;
656 }
657
assertEqual(CollationElementIterator & i1,CollationElementIterator & i2)658 void CollationIteratorTest::assertEqual(CollationElementIterator &i1, CollationElementIterator &i2)
659 {
660 int32_t c1, c2, count = 0;
661 UErrorCode status = U_ZERO_ERROR;
662
663 do
664 {
665 c1 = i1.next(status);
666 c2 = i2.next(status);
667
668 if (c1 != c2)
669 {
670 errln(" %d: strength(0x%X) != strength(0x%X)", count, c1, c2);
671 break;
672 }
673
674 count += 1;
675 }
676 while (c1 != CollationElementIterator::NULLORDER);
677 }
678
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)679 void CollationIteratorTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* /*par*/)
680 {
681 if (exec)
682 {
683 logln("Collation Iteration Tests: ");
684 }
685
686 if(en_us) {
687 switch (index)
688 {
689 case 0: name = "TestPrevious"; if (exec) TestPrevious(/* par */); break;
690 case 1: name = "TestOffset"; if (exec) TestOffset(/* par */); break;
691 case 2: name = "TestSetText"; if (exec) TestSetText(/* par */); break;
692 case 3: name = "TestMaxExpansion"; if (exec) TestMaxExpansion(/* par */); break;
693 case 4: name = "TestClearBuffers"; if (exec) TestClearBuffers(/* par */); break;
694 case 5: name = "TestUnicodeChar"; if (exec) TestUnicodeChar(/* par */); break;
695 case 6: name = "TestAssignment"; if (exec) TestAssignment(/* par */); break;
696 case 7: name = "TestConstructors"; if (exec) TestConstructors(/* par */); break;
697 case 8: name = "TestStrengthOrder"; if (exec) TestStrengthOrder(/* par */); break;
698 default: name = ""; break;
699 }
700 } else {
701 dataerrln("Class iterator not instantiated");
702 name = "";
703 }
704 }
705
706 #endif /* #if !UCONFIG_NO_COLLATION */
707