• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 1997-2013, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 /********************************************************************************
7 *
8 * File CITERTST.C
9 *
10 * Modification History:
11 * Date      Name               Description
12 *           Madhu Katragadda   Ported for C API
13 * 02/19/01  synwee             Modified test case for new collation iterator
14 *********************************************************************************/
15 /*
16  * Collation Iterator tests.
17  * (Let me reiterate my position...)
18  */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_COLLATION
23 
24 #include "unicode/ucol.h"
25 #include "unicode/ucoleitr.h"
26 #include "unicode/uloc.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ustring.h"
29 #include "unicode/putil.h"
30 #include "callcoll.h"
31 #include "cmemory.h"
32 #include "cintltst.h"
33 #include "citertst.h"
34 #include "ccolltst.h"
35 #include "filestrm.h"
36 #include "cstring.h"
37 #include "ucol_imp.h"
38 #include "ucol_tok.h"
39 #include "uparse.h"
40 #include <stdio.h>
41 
42 extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *);
43 
/**
 * Registers the collation element iterator tests of this file with the
 * test framework.
 *
 * @param root root of the test tree the tests are added to
 */
void addCollIterTest(TestNode** root)
{
    /* Registration table; tests are added in exactly the order listed. */
    static const struct {
        void       (*test)(void);
        const char  *path;
    } TESTS[] = {
        { TestPrevious,               "tscoll/citertst/TestPrevious" },
        { TestOffset,                 "tscoll/citertst/TestOffset" },
        { TestSetText,                "tscoll/citertst/TestSetText" },
        { TestMaxExpansion,           "tscoll/citertst/TestMaxExpansion" },
        { TestUnicodeChar,            "tscoll/citertst/TestUnicodeChar" },
        { TestNormalizedUnicodeChar,  "tscoll/citertst/TestNormalizedUnicodeChar" },
        { TestNormalization,          "tscoll/citertst/TestNormalization" },
        { TestBug672,                 "tscoll/citertst/TestBug672" },
        { TestBug672Normalize,        "tscoll/citertst/TestBug672Normalize" },
        { TestSmallBuffer,            "tscoll/citertst/TestSmallBuffer" },
        { TestCEs,                    "tscoll/citertst/TestCEs" },
        { TestDiscontiguos,           "tscoll/citertst/TestDiscontiguos" },
        { TestCEBufferOverflow,       "tscoll/citertst/TestCEBufferOverflow" },
        { TestCEValidity,             "tscoll/citertst/TestCEValidity" },
        { TestSortKeyValidity,        "tscoll/citertst/TestSortKeyValidity" },
        { TestSearchCollatorElements, "tscoll/citertst/TestSearchCollatorElements" }
    };
    const int32_t testCount = (int32_t)(sizeof(TESTS) / sizeof(TESTS[0]));
    int32_t       idx;

    for (idx = 0; idx < testCount; ++idx) {
        addTest(root, TESTS[idx].test, TESTS[idx].path);
    }
}
64 
/* Locales iterated over (by index 0..2) in TestBug672() and
 * TestBug672Normalize(). */
static const char * LOCALES[] = {
    "en_AU",
    "en_BE",
    "en_CA"
};
68 
TestBug672()69 static void TestBug672() {
70     UErrorCode  status = U_ZERO_ERROR;
71     UChar       pattern[20];
72     UChar       text[50];
73     int         i;
74     int         result[3][3];
75 
76     u_uastrcpy(pattern, "resume");
77     u_uastrcpy(text, "Time to resume updating my resume.");
78 
79     for (i = 0; i < 3; ++ i) {
80         UCollator          *coll = ucol_open(LOCALES[i], &status);
81         UCollationElements *pitr = ucol_openElements(coll, pattern, -1,
82                                                      &status);
83         UCollationElements *titer = ucol_openElements(coll, text, -1,
84                                                      &status);
85         if (U_FAILURE(status)) {
86             log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n",
87                     myErrorName(status));
88             return;
89         }
90 
91         log_verbose("locale tested %s\n", LOCALES[i]);
92 
93         while (ucol_next(pitr, &status) != UCOL_NULLORDER &&
94                U_SUCCESS(status)) {
95         }
96         if (U_FAILURE(status)) {
97             log_err("ERROR: reversing collation iterator :%s\n",
98                     myErrorName(status));
99             return;
100         }
101         ucol_reset(pitr);
102 
103         ucol_setOffset(titer, u_strlen(pattern), &status);
104         if (U_FAILURE(status)) {
105             log_err("ERROR: setting offset in collator :%s\n",
106                     myErrorName(status));
107             return;
108         }
109         result[i][0] = ucol_getOffset(titer);
110         log_verbose("Text iterator set to offset %d\n", result[i][0]);
111 
112         /* Use previous() */
113         ucol_previous(titer, &status);
114         result[i][1] = ucol_getOffset(titer);
115         log_verbose("Current offset %d after previous\n", result[i][1]);
116 
117         /* Add one to index */
118         log_verbose("Adding one to current offset...\n");
119         ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status);
120         if (U_FAILURE(status)) {
121             log_err("ERROR: setting offset in collator :%s\n",
122                     myErrorName(status));
123             return;
124         }
125         result[i][2] = ucol_getOffset(titer);
126         log_verbose("Current offset in text = %d\n", result[i][2]);
127         ucol_closeElements(pitr);
128         ucol_closeElements(titer);
129         ucol_close(coll);
130     }
131 
132     if (uprv_memcmp(result[0], result[1], 3) != 0 ||
133         uprv_memcmp(result[1], result[2], 3) != 0) {
134         log_err("ERROR: Different locales have different offsets at the same character\n");
135     }
136 }
137 
138 
139 
140 /*  Running this test with normalization enabled showed up a bug in the incremental
141     normalization code. */
TestBug672Normalize()142 static void TestBug672Normalize() {
143     UErrorCode  status = U_ZERO_ERROR;
144     UChar       pattern[20];
145     UChar       text[50];
146     int         i;
147     int         result[3][3];
148 
149     u_uastrcpy(pattern, "resume");
150     u_uastrcpy(text, "Time to resume updating my resume.");
151 
152     for (i = 0; i < 3; ++ i) {
153         UCollator          *coll = ucol_open(LOCALES[i], &status);
154         UCollationElements *pitr = NULL;
155         UCollationElements *titer = NULL;
156 
157         ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
158 
159         pitr = ucol_openElements(coll, pattern, -1, &status);
160         titer = ucol_openElements(coll, text, -1, &status);
161         if (U_FAILURE(status)) {
162             log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n",
163                     myErrorName(status));
164             return;
165         }
166 
167         log_verbose("locale tested %s\n", LOCALES[i]);
168 
169         while (ucol_next(pitr, &status) != UCOL_NULLORDER &&
170                U_SUCCESS(status)) {
171         }
172         if (U_FAILURE(status)) {
173             log_err("ERROR: reversing collation iterator :%s\n",
174                     myErrorName(status));
175             return;
176         }
177         ucol_reset(pitr);
178 
179         ucol_setOffset(titer, u_strlen(pattern), &status);
180         if (U_FAILURE(status)) {
181             log_err("ERROR: setting offset in collator :%s\n",
182                     myErrorName(status));
183             return;
184         }
185         result[i][0] = ucol_getOffset(titer);
186         log_verbose("Text iterator set to offset %d\n", result[i][0]);
187 
188         /* Use previous() */
189         ucol_previous(titer, &status);
190         result[i][1] = ucol_getOffset(titer);
191         log_verbose("Current offset %d after previous\n", result[i][1]);
192 
193         /* Add one to index */
194         log_verbose("Adding one to current offset...\n");
195         ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status);
196         if (U_FAILURE(status)) {
197             log_err("ERROR: setting offset in collator :%s\n",
198                     myErrorName(status));
199             return;
200         }
201         result[i][2] = ucol_getOffset(titer);
202         log_verbose("Current offset in text = %d\n", result[i][2]);
203         ucol_closeElements(pitr);
204         ucol_closeElements(titer);
205         ucol_close(coll);
206     }
207 
208     if (uprv_memcmp(result[0], result[1], 3) != 0 ||
209         uprv_memcmp(result[1], result[2], 3) != 0) {
210         log_err("ERROR: Different locales have different offsets at the same character\n");
211     }
212 }
213 
214 
215 
216 
217 /**
218  * Test for CollationElementIterator previous and next for the whole set of
219  * unicode characters.
220  */
TestUnicodeChar()221 static void TestUnicodeChar()
222 {
223     UChar source[0x100];
224     UCollator *en_us;
225     UCollationElements *iter;
226     UErrorCode status = U_ZERO_ERROR;
227     UChar codepoint;
228 
229     UChar *test;
230     en_us = ucol_open("en_US", &status);
231     if (U_FAILURE(status)){
232        log_err_status(status, "ERROR: in creation of collation data using ucol_open()\n %s\n",
233               myErrorName(status));
234        return;
235     }
236 
237     for (codepoint = 1; codepoint < 0xFFFE;)
238     {
239       test = source;
240 
241       while (codepoint % 0xFF != 0)
242       {
243         if (u_isdefined(codepoint))
244           *(test ++) = codepoint;
245         codepoint ++;
246       }
247 
248       if (u_isdefined(codepoint))
249         *(test ++) = codepoint;
250 
251       if (codepoint != 0xFFFF)
252         codepoint ++;
253 
254       *test = 0;
255       iter=ucol_openElements(en_us, source, u_strlen(source), &status);
256       if(U_FAILURE(status)){
257           log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
258               myErrorName(status));
259           ucol_close(en_us);
260           return;
261       }
262       /* A basic test to see if it's working at all */
263       log_verbose("codepoint testing %x\n", codepoint);
264       backAndForth(iter);
265       ucol_closeElements(iter);
266 
267       /* null termination test */
268       iter=ucol_openElements(en_us, source, -1, &status);
269       if(U_FAILURE(status)){
270           log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
271               myErrorName(status));
272           ucol_close(en_us);
273           return;
274       }
275       /* A basic test to see if it's working at all */
276       backAndForth(iter);
277       ucol_closeElements(iter);
278     }
279 
280     ucol_close(en_us);
281 }
282 
283 /**
284  * Test for CollationElementIterator previous and next for the whole set of
285  * unicode characters with normalization on.
286  */
TestNormalizedUnicodeChar()287 static void TestNormalizedUnicodeChar()
288 {
289     UChar source[0x100];
290     UCollator *th_th;
291     UCollationElements *iter;
292     UErrorCode status = U_ZERO_ERROR;
293     UChar codepoint;
294 
295     UChar *test;
296     /* thai should have normalization on */
297     th_th = ucol_open("th_TH", &status);
298     if (U_FAILURE(status)){
299         log_err_status(status, "ERROR: in creation of thai collation using ucol_open()\n %s\n",
300               myErrorName(status));
301         return;
302     }
303 
304     for (codepoint = 1; codepoint < 0xFFFE;)
305     {
306       test = source;
307 
308       while (codepoint % 0xFF != 0)
309       {
310         if (u_isdefined(codepoint))
311           *(test ++) = codepoint;
312         codepoint ++;
313       }
314 
315       if (u_isdefined(codepoint))
316         *(test ++) = codepoint;
317 
318       if (codepoint != 0xFFFF)
319         codepoint ++;
320 
321       *test = 0;
322       iter=ucol_openElements(th_th, source, u_strlen(source), &status);
323       if(U_FAILURE(status)){
324           log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
325               myErrorName(status));
326             ucol_close(th_th);
327           return;
328       }
329 
330       backAndForth(iter);
331       ucol_closeElements(iter);
332 
333       iter=ucol_openElements(th_th, source, -1, &status);
334       if(U_FAILURE(status)){
335           log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
336               myErrorName(status));
337             ucol_close(th_th);
338           return;
339       }
340 
341       backAndForth(iter);
342       ucol_closeElements(iter);
343     }
344 
345     ucol_close(th_th);
346 }
347 
348 /**
349 * Test the incremental normalization
350 */
TestNormalization()351 static void TestNormalization()
352 {
353           UErrorCode          status = U_ZERO_ERROR;
354     const char               *str    =
355                             "&a < \\u0300\\u0315 < A\\u0300\\u0315 < \\u0316\\u0315B < \\u0316\\u0300\\u0315";
356           UCollator          *coll;
357           UChar               rule[50];
358           int                 rulelen = u_unescape(str, rule, 50);
359           int                 count = 0;
360     const char                *testdata[] =
361                         {"\\u1ED9", "o\\u0323\\u0302",
362                         "\\u0300\\u0315", "\\u0315\\u0300",
363                         "A\\u0300\\u0315B", "A\\u0315\\u0300B",
364                         "A\\u0316\\u0315B", "A\\u0315\\u0316B",
365                         "\\u0316\\u0300\\u0315", "\\u0315\\u0300\\u0316",
366                         "A\\u0316\\u0300\\u0315B", "A\\u0315\\u0300\\u0316B",
367                         "\\u0316\\u0315\\u0300", "A\\u0316\\u0315\\u0300B"};
368     int32_t   srclen;
369     UChar source[10];
370     UCollationElements *iter;
371 
372     coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status);
373     ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
374     if (U_FAILURE(status)){
375         log_err_status(status, "ERROR: in creation of collator using ucol_openRules()\n %s\n",
376               myErrorName(status));
377         return;
378     }
379 
380     srclen = u_unescape(testdata[0], source, 10);
381     iter = ucol_openElements(coll, source, srclen, &status);
382     backAndForth(iter);
383     ucol_closeElements(iter);
384 
385     srclen = u_unescape(testdata[1], source, 10);
386     iter = ucol_openElements(coll, source, srclen, &status);
387     backAndForth(iter);
388     ucol_closeElements(iter);
389 
390     while (count < 12) {
391         srclen = u_unescape(testdata[count], source, 10);
392         iter = ucol_openElements(coll, source, srclen, &status);
393 
394         if (U_FAILURE(status)){
395             log_err("ERROR: in creation of collator element iterator\n %s\n",
396                   myErrorName(status));
397             return;
398         }
399         backAndForth(iter);
400         ucol_closeElements(iter);
401 
402         iter = ucol_openElements(coll, source, -1, &status);
403 
404         if (U_FAILURE(status)){
405             log_err("ERROR: in creation of collator element iterator\n %s\n",
406                   myErrorName(status));
407             return;
408         }
409         backAndForth(iter);
410         ucol_closeElements(iter);
411         count ++;
412     }
413     ucol_close(coll);
414 }
415 
416 /**
417  * Test for CollationElementIterator.previous()
418  *
419  * @bug 4108758 - Make sure it works with contracting characters
420  *
421  */
TestPrevious()422 static void TestPrevious()
423 {
424     UCollator *coll=NULL;
425     UChar rule[50];
426     UChar *source;
427     UCollator *c1, *c2, *c3;
428     UCollationElements *iter;
429     UErrorCode status = U_ZERO_ERROR;
430     UChar test1[50];
431     UChar test2[50];
432 
433     u_uastrcpy(test1, "What subset of all possible test cases?");
434     u_uastrcpy(test2, "has the highest probability of detecting");
435     coll = ucol_open("en_US", &status);
436 
437     iter=ucol_openElements(coll, test1, u_strlen(test1), &status);
438     log_verbose("English locale testing back and forth\n");
439     if(U_FAILURE(status)){
440         log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
441             myErrorName(status));
442         ucol_close(coll);
443         return;
444     }
445     /* A basic test to see if it's working at all */
446     backAndForth(iter);
447     ucol_closeElements(iter);
448     ucol_close(coll);
449 
450     /* Test with a contracting character sequence */
451     u_uastrcpy(rule, "&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH");
452     c1 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status);
453 
454     log_verbose("Contraction rule testing back and forth with no normalization\n");
455 
456     if (c1 == NULL || U_FAILURE(status))
457     {
458         log_err("Couldn't create a RuleBasedCollator with a contracting sequence\n %s\n",
459             myErrorName(status));
460         return;
461     }
462     source=(UChar*)malloc(sizeof(UChar) * 20);
463     u_uastrcpy(source, "abchdcba");
464     iter=ucol_openElements(c1, source, u_strlen(source), &status);
465     if(U_FAILURE(status)){
466         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
467             myErrorName(status));
468         return;
469     }
470     backAndForth(iter);
471     ucol_closeElements(iter);
472     ucol_close(c1);
473 
474     /* Test with an expanding character sequence */
475     u_uastrcpy(rule, "&a < b < c/abd < d");
476     c2 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status);
477     log_verbose("Expansion rule testing back and forth with no normalization\n");
478     if (c2 == NULL || U_FAILURE(status))
479     {
480         log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
481             myErrorName(status));
482         return;
483     }
484     u_uastrcpy(source, "abcd");
485     iter=ucol_openElements(c2, source, u_strlen(source), &status);
486     if(U_FAILURE(status)){
487         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
488             myErrorName(status));
489         return;
490     }
491     backAndForth(iter);
492     ucol_closeElements(iter);
493     ucol_close(c2);
494     /* Now try both */
495     u_uastrcpy(rule, "&a < b < c/aba < d < z < ch");
496     c3 = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,  UCOL_DEFAULT_STRENGTH,NULL, &status);
497     log_verbose("Expansion/contraction rule testing back and forth with no normalization\n");
498 
499     if (c3 == NULL || U_FAILURE(status))
500     {
501         log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
502             myErrorName(status));
503         return;
504     }
505     u_uastrcpy(source, "abcdbchdc");
506     iter=ucol_openElements(c3, source, u_strlen(source), &status);
507     if(U_FAILURE(status)){
508         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
509             myErrorName(status));
510         return;
511     }
512     backAndForth(iter);
513     ucol_closeElements(iter);
514     ucol_close(c3);
515     source[0] = 0x0e41;
516     source[1] = 0x0e02;
517     source[2] = 0x0e41;
518     source[3] = 0x0e02;
519     source[4] = 0x0e27;
520     source[5] = 0x61;
521     source[6] = 0x62;
522     source[7] = 0x63;
523     source[8] = 0;
524 
525     coll = ucol_open("th_TH", &status);
526     log_verbose("Thai locale testing back and forth with normalization\n");
527     iter=ucol_openElements(coll, source, u_strlen(source), &status);
528     if(U_FAILURE(status)){
529         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
530             myErrorName(status));
531         return;
532     }
533     backAndForth(iter);
534     ucol_closeElements(iter);
535     ucol_close(coll);
536 
537     /* prev test */
538     source[0] = 0x0061;
539     source[1] = 0x30CF;
540     source[2] = 0x3099;
541     source[3] = 0x30FC;
542     source[4] = 0;
543 
544     coll = ucol_open("ja_JP", &status);
545     log_verbose("Japanese locale testing back and forth with normalization\n");
546     iter=ucol_openElements(coll, source, u_strlen(source), &status);
547     if(U_FAILURE(status)){
548         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
549             myErrorName(status));
550         return;
551     }
552     backAndForth(iter);
553     ucol_closeElements(iter);
554     ucol_close(coll);
555 
556     free(source);
557 }
558 
559 /**
560  * Test for getOffset() and setOffset()
561  */
TestOffset()562 static void TestOffset()
563 {
564     UErrorCode status= U_ZERO_ERROR;
565     UCollator *en_us=NULL;
566     UCollationElements *iter, *pristine;
567     int32_t offset;
568     OrderAndOffset *orders;
569     int32_t orderLength=0;
570     int     count = 0;
571     UChar test1[50];
572     UChar test2[50];
573 
574     u_uastrcpy(test1, "What subset of all possible test cases?");
575     u_uastrcpy(test2, "has the highest probability of detecting");
576     en_us = ucol_open("en_US", &status);
577     log_verbose("Testing getOffset and setOffset for collations\n");
578     iter = ucol_openElements(en_us, test1, u_strlen(test1), &status);
579     if(U_FAILURE(status)){
580         log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
581             myErrorName(status));
582         ucol_close(en_us);
583         return;
584     }
585 
586     /* testing boundaries */
587     ucol_setOffset(iter, 0, &status);
588     if (U_FAILURE(status) || ucol_previous(iter, &status) != UCOL_NULLORDER) {
589         log_err("Error: After setting offset to 0, we should be at the end "
590                 "of the backwards iteration");
591     }
592     ucol_setOffset(iter, u_strlen(test1), &status);
593     if (U_FAILURE(status) || ucol_next(iter, &status) != UCOL_NULLORDER) {
594         log_err("Error: After setting offset to end of the string, we should "
595                 "be at the end of the backwards iteration");
596     }
597 
598     /* Run all the way through the iterator, then get the offset */
599 
600     orders = getOrders(iter, &orderLength);
601 
602     offset = ucol_getOffset(iter);
603 
604     if (offset != u_strlen(test1))
605     {
606         log_err("offset at end != length %d vs %d\n", offset,
607             u_strlen(test1) );
608     }
609 
610     /* Now set the offset back to the beginning and see if it works */
611     pristine=ucol_openElements(en_us, test1, u_strlen(test1), &status);
612     if(U_FAILURE(status)){
613         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
614             myErrorName(status));
615     ucol_close(en_us);
616         return;
617     }
618     status = U_ZERO_ERROR;
619 
620     ucol_setOffset(iter, 0, &status);
621     if (U_FAILURE(status))
622     {
623         log_err("setOffset failed. %s\n",    myErrorName(status));
624     }
625     else
626     {
627         assertEqual(iter, pristine);
628     }
629 
630     ucol_closeElements(pristine);
631     ucol_closeElements(iter);
632     free(orders);
633 
634     /* testing offsets in normalization buffer */
635     test1[0] = 0x61;
636     test1[1] = 0x300;
637     test1[2] = 0x316;
638     test1[3] = 0x62;
639     test1[4] = 0;
640     ucol_setAttribute(en_us, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
641     iter = ucol_openElements(en_us, test1, 4, &status);
642     if(U_FAILURE(status)){
643         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
644             myErrorName(status));
645         ucol_close(en_us);
646         return;
647     }
648 
649     count = 0;
650     while (ucol_next(iter, &status) != UCOL_NULLORDER &&
651         U_SUCCESS(status)) {
652         switch (count) {
653         case 0:
654             if (ucol_getOffset(iter) != 1) {
655                 log_err("ERROR: Offset of iteration should be 1\n");
656             }
657             break;
658         case 3:
659             if (ucol_getOffset(iter) != 4) {
660                 log_err("ERROR: Offset of iteration should be 4\n");
661             }
662             break;
663         default:
664             if (ucol_getOffset(iter) != 3) {
665                 log_err("ERROR: Offset of iteration should be 3\n");
666             }
667         }
668         count ++;
669     }
670 
671     ucol_reset(iter);
672     count = 0;
673     while (ucol_previous(iter, &status) != UCOL_NULLORDER &&
674         U_SUCCESS(status)) {
675         switch (count) {
676         case 0:
677         case 1:
678             if (ucol_getOffset(iter) != 3) {
679                 log_err("ERROR: Offset of iteration should be 3\n");
680             }
681             break;
682         case 2:
683             if (ucol_getOffset(iter) != 1) {
684                 log_err("ERROR: Offset of iteration should be 1\n");
685             }
686             break;
687         default:
688             if (ucol_getOffset(iter) != 0) {
689                 log_err("ERROR: Offset of iteration should be 0\n");
690             }
691         }
692         count ++;
693     }
694 
695     if(U_FAILURE(status)){
696         log_err("ERROR: in iterating collation elements %s\n",
697             myErrorName(status));
698     }
699 
700     ucol_closeElements(iter);
701     ucol_close(en_us);
702 }
703 
704 /**
705  * Test for setText()
706  */
TestSetText()707 static void TestSetText()
708 {
709     int32_t c,i;
710     UErrorCode status = U_ZERO_ERROR;
711     UCollator *en_us=NULL;
712     UCollationElements *iter1, *iter2;
713     UChar test1[50];
714     UChar test2[50];
715 
716     u_uastrcpy(test1, "What subset of all possible test cases?");
717     u_uastrcpy(test2, "has the highest probability of detecting");
718     en_us = ucol_open("en_US", &status);
719     log_verbose("testing setText for Collation elements\n");
720     iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status);
721     if(U_FAILURE(status)){
722         log_err_status(status, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n",
723             myErrorName(status));
724     ucol_close(en_us);
725         return;
726     }
727     iter2=ucol_openElements(en_us, test2, u_strlen(test2), &status);
728     if(U_FAILURE(status)){
729         log_err("ERROR: in creation of collation element iterator2 using ucol_openElements()\n %s\n",
730             myErrorName(status));
731     ucol_close(en_us);
732         return;
733     }
734 
735     /* Run through the second iterator just to exercise it */
736     c = ucol_next(iter2, &status);
737     i = 0;
738 
739     while ( ++i < 10 && (c != UCOL_NULLORDER))
740     {
741         if (U_FAILURE(status))
742         {
743             log_err("iter2->next() returned an error. %s\n", myErrorName(status));
744             ucol_closeElements(iter2);
745             ucol_closeElements(iter1);
746     ucol_close(en_us);
747             return;
748         }
749 
750         c = ucol_next(iter2, &status);
751     }
752 
753     /* Now set it to point to the same string as the first iterator */
754     ucol_setText(iter2, test1, u_strlen(test1), &status);
755     if (U_FAILURE(status))
756     {
757         log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status));
758     }
759     else
760     {
761         assertEqual(iter1, iter2);
762     }
763 
764     /* Now set it to point to a null string with fake length*/
765     ucol_setText(iter2, NULL, 2, &status);
766     if (U_FAILURE(status))
767     {
768         log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status));
769     }
770     else
771     {
772         if (ucol_next(iter2, &status) != UCOL_NULLORDER) {
773             log_err("iter2 with null text expected to return UCOL_NULLORDER\n");
774         }
775     }
776 
777     ucol_closeElements(iter2);
778     ucol_closeElements(iter1);
779     ucol_close(en_us);
780 }
781 
782 /** @bug 4108762
783  * Test for getMaxExpansion()
784  */
TestMaxExpansion()785 static void TestMaxExpansion()
786 {
787     UErrorCode          status = U_ZERO_ERROR;
788     UCollator          *coll   ;/*= ucol_open("en_US", &status);*/
789     UChar               ch     = 0;
790     UChar32             unassigned = 0xEFFFD;
791     UChar               supplementary[2];
792     uint32_t            stringOffset = 0;
793     UBool               isError = FALSE;
794     uint32_t            sorder = 0;
795     UCollationElements *iter   ;/*= ucol_openElements(coll, &ch, 1, &status);*/
796     uint32_t            temporder = 0;
797 
798     UChar rule[256];
799     u_uastrcpy(rule, "&a < ab < c/aba < d < z < ch");
800     coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,
801         UCOL_DEFAULT_STRENGTH,NULL, &status);
802     if(U_SUCCESS(status) && coll) {
803       iter = ucol_openElements(coll, &ch, 1, &status);
804 
805       while (ch < 0xFFFF && U_SUCCESS(status)) {
806           int      count = 1;
807           uint32_t order;
808           int32_t  size = 0;
809 
810           ch ++;
811 
812           ucol_setText(iter, &ch, 1, &status);
813           order = ucol_previous(iter, &status);
814 
815           /* thai management */
816           if (order == 0)
817               order = ucol_previous(iter, &status);
818 
819           while (U_SUCCESS(status) &&
820               ucol_previous(iter, &status) != UCOL_NULLORDER) {
821               count ++;
822           }
823 
824           size = ucol_getMaxExpansion(iter, order);
825           if (U_FAILURE(status) || size < count) {
826               log_err("Failure at codepoint %d, maximum expansion count < %d\n",
827                   ch, count);
828           }
829       }
830 
831       /* testing for exact max expansion */
832       ch = 0;
833       while (ch < 0x61) {
834           uint32_t order;
835           int32_t  size;
836           ucol_setText(iter, &ch, 1, &status);
837           order = ucol_previous(iter, &status);
838           size  = ucol_getMaxExpansion(iter, order);
839           if (U_FAILURE(status) || size != 1) {
840               log_err("Failure at codepoint %d, maximum expansion count < %d\n",
841                   ch, 1);
842           }
843           ch ++;
844       }
845 
846       ch = 0x63;
847       ucol_setText(iter, &ch, 1, &status);
848       temporder = ucol_previous(iter, &status);
849 
850       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 3) {
851           log_err("Failure at codepoint %d, maximum expansion count != %d\n",
852                   ch, 3);
853       }
854 
855       ch = 0x64;
856       ucol_setText(iter, &ch, 1, &status);
857       temporder = ucol_previous(iter, &status);
858 
859       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 1) {
860           log_err("Failure at codepoint %d, maximum expansion count != %d\n",
861                   ch, 3);
862       }
863 
864       U16_APPEND(supplementary, stringOffset, 2, unassigned, isError);
865       ucol_setText(iter, supplementary, 2, &status);
866       sorder = ucol_previous(iter, &status);
867 
868       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, sorder) != 2) {
869           log_err("Failure at codepoint %d, maximum expansion count < %d\n",
870                   ch, 2);
871       }
872 
873       /* testing jamo */
874       ch = 0x1165;
875 
876       ucol_setText(iter, &ch, 1, &status);
877       temporder = ucol_previous(iter, &status);
878       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) > 3) {
879           log_err("Failure at codepoint %d, maximum expansion count > %d\n",
880                   ch, 3);
881       }
882 
883       ucol_closeElements(iter);
884       ucol_close(coll);
885 
886       /* testing special jamo &a<\u1160 */
887       rule[0] = 0x26;
888       rule[1] = 0x71;
889       rule[2] = 0x3c;
890       rule[3] = 0x1165;
891       rule[4] = 0x2f;
892       rule[5] = 0x71;
893       rule[6] = 0x71;
894       rule[7] = 0x71;
895       rule[8] = 0x71;
896       rule[9] = 0;
897 
898       coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,
899           UCOL_DEFAULT_STRENGTH,NULL, &status);
900       iter = ucol_openElements(coll, &ch, 1, &status);
901 
902       temporder = ucol_previous(iter, &status);
903       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 6) {
904           log_err("Failure at codepoint %d, maximum expansion count > %d\n",
905                   ch, 5);
906       }
907 
908       ucol_closeElements(iter);
909       ucol_close(coll);
910     } else {
911       log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status));
912     }
913 
914 }
915 
916 
/**
 * Steps two collation element iterators forward in lockstep and reports the
 * first position at which they return different values.
 * Iteration stops after the first mismatch, or once both iterators have
 * returned UCOL_NULLORDER.
 * @param i1 first iterator
 * @param i2 second iterator
 */
static void assertEqual(UCollationElements *i1, UCollationElements *i2)
{
    UErrorCode errorCode = U_ZERO_ERROR;
    int32_t    step;

    for (step = 0; ; ++step) {
        int32_t first  = ucol_next(i1, &errorCode);
        int32_t second = ucol_next(i2, &errorCode);

        if (first != second) {
            log_err("Error in iteration %d assetEqual between\n  %d  and   %d, they are not equal\n", step, first, second);
            return;
        }
        if (first == UCOL_NULLORDER) {
            /* both iterators are exhausted */
            return;
        }
    }
}
938 
939 /**
940  * Testing iterators with extremely small buffers
941  */
TestSmallBuffer()942 static void TestSmallBuffer()
943 {
944     UErrorCode          status = U_ZERO_ERROR;
945     UCollator          *coll;
946     UCollationElements *testiter,
947                        *iter;
948     int32_t             count = 0;
949     OrderAndOffset     *testorders,
950                        *orders;
951 
952     UChar teststr[500];
953     UChar str[] = {0x300, 0x31A, 0};
954     /*
955     creating a long string of decomposable characters,
956     since by default the writable buffer is of size 256
957     */
958     while (count < 500) {
959         if ((count & 1) == 0) {
960             teststr[count ++] = 0x300;
961         }
962         else {
963             teststr[count ++] = 0x31A;
964         }
965     }
966 
967     coll = ucol_open("th_TH", &status);
968     if(U_SUCCESS(status) && coll) {
969       testiter = ucol_openElements(coll, teststr, 500, &status);
970       iter = ucol_openElements(coll, str, 2, &status);
971 
972       orders     = getOrders(iter, &count);
973       if (count != 2) {
974           log_err("Error collation elements size is not 2 for \\u0300\\u031A\n");
975       }
976 
977       /*
978       this will rearrange the string data to 250 characters of 0x300 first then
979       250 characters of 0x031A
980       */
981       testorders = getOrders(testiter, &count);
982 
983       if (count != 500) {
984           log_err("Error decomposition does not give the right sized collation elements\n");
985       }
986 
987       while (count != 0) {
988           /* UCA collation element for 0x0F76 */
989           if ((count > 250 && testorders[-- count].order != orders[1].order) ||
990               (count <= 250 && testorders[-- count].order != orders[0].order)) {
991               log_err("Error decomposition does not give the right collation element at %d count\n", count);
992               break;
993           }
994       }
995 
996       free(testorders);
997       free(orders);
998 
999       ucol_reset(testiter);
1000 
1001       /* ensures closing of elements done properly to clear writable buffer */
1002       ucol_next(testiter, &status);
1003       ucol_next(testiter, &status);
1004       ucol_closeElements(testiter);
1005       ucol_closeElements(iter);
1006       ucol_close(coll);
1007     } else {
1008       log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status));
1009     }
1010 }
1011 
/**
* Snippet of code from genuca: value of a single hexadecimal digit.
* @param hex character to convert
* @return 0..15 for '0'-'9', 'a'-'f' and 'A'-'F'; 0 for any other character
*/
static int32_t hex2num(char hex) {
    int32_t digit = 0;

    if ('0' <= hex && hex <= '9') {
        digit = hex - '0';
    } else if ('a' <= hex && hex <= 'f') {
        digit = 10 + (hex - 'a');
    } else if ('A' <= hex && hex <= 'F') {
        digit = 10 + (hex - 'A');
    }
    return digit;
}
1026 
1027 /**
1028 * Getting codepoints from a string
1029 * @param str character string contain codepoints seperated by space and ended
1030 *        by a semicolon
1031 * @param codepoints array for storage, assuming size > 5
1032 * @return position at the end of the codepoint section
1033 */
getCodePoints(char * str,UChar * codepoints,UChar * contextCPs)1034 static char *getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) {
1035     UErrorCode errorCode = U_ZERO_ERROR;
1036     char *semi = uprv_strchr(str, ';');
1037     char *pipe = uprv_strchr(str, '|');
1038     char *s;
1039     *codepoints = 0;
1040     *contextCPs = 0;
1041     if(semi == NULL) {
1042         log_err("expected semicolon after code point string in FractionalUCA.txt %s\n", str);
1043         return str;
1044     }
1045     if(pipe != NULL) {
1046         int32_t contextLength;
1047         *pipe = 0;
1048         contextLength = u_parseString(str, contextCPs, 99, NULL, &errorCode);
1049         *pipe = '|';
1050         if(U_FAILURE(errorCode)) {
1051             log_err("error parsing precontext string from FractionalUCA.txt %s\n", str);
1052             return str;
1053         }
1054         /* prepend the precontext string to the codepoints */
1055         u_memcpy(codepoints, contextCPs, contextLength);
1056         codepoints += contextLength;
1057         /* start of the code point string */
1058         s = pipe + 1;
1059     } else {
1060         s = str;
1061     }
1062     u_parseString(s, codepoints, 99, NULL, &errorCode);
1063     if(U_FAILURE(errorCode)) {
1064         log_err("error parsing code point string from FractionalUCA.txt %s\n", str);
1065         return str;
1066     }
1067     return semi + 1;
1068 }
1069 
1070 /**
1071 * Sniplets of code from genuca
1072 */
1073 static int32_t
readElement(char ** from,char * to,char separator,UErrorCode * status)1074 readElement(char **from, char *to, char separator, UErrorCode *status)
1075 {
1076     if (U_SUCCESS(*status)) {
1077         char    buffer[1024];
1078         int32_t i = 0;
1079         while (**from != separator) {
1080             if (**from != ' ') {
1081                 *(buffer+i++) = **from;
1082             }
1083             (*from)++;
1084         }
1085         (*from)++;
1086         *(buffer + i) = 0;
1087         strcpy(to, buffer);
1088         return i/2;
1089     }
1090 
1091     return 0;
1092 }
1093 
1094 /**
1095 * Sniplets of code from genuca
1096 */
1097 static uint32_t
getSingleCEValue(char * primary,char * secondary,char * tertiary,UErrorCode * status)1098 getSingleCEValue(char *primary, char *secondary, char *tertiary,
1099                           UErrorCode *status)
1100 {
1101     if (U_SUCCESS(*status)) {
1102         uint32_t  value    = 0;
1103         char      primsave = '\0';
1104         char      secsave  = '\0';
1105         char      tersave  = '\0';
1106         char     *primend  = primary+4;
1107         char     *secend   = secondary+2;
1108         char     *terend   = tertiary+2;
1109         uint32_t  primvalue;
1110         uint32_t  secvalue;
1111         uint32_t  tervalue;
1112 
1113         if (uprv_strlen(primary) > 4) {
1114             primsave = *primend;
1115             *primend = '\0';
1116         }
1117 
1118         if (uprv_strlen(secondary) > 2) {
1119             secsave = *secend;
1120             *secend = '\0';
1121         }
1122 
1123         if (uprv_strlen(tertiary) > 2) {
1124             tersave = *terend;
1125             *terend = '\0';
1126         }
1127 
1128         primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0;
1129         secvalue  = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0;
1130         tervalue  = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0;
1131         if(primvalue <= 0xFF) {
1132           primvalue <<= 8;
1133         }
1134 
1135         value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK)
1136            | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK)
1137            | (tervalue & UCOL_TERTIARYORDERMASK);
1138 
1139         if(primsave!='\0') {
1140             *primend = primsave;
1141         }
1142         if(secsave!='\0') {
1143             *secend = secsave;
1144         }
1145         if(tersave!='\0') {
1146             *terend = tersave;
1147         }
1148         return value;
1149     }
1150     return 0;
1151 }
1152 
1153 /**
1154 * Getting collation elements generated from a string
1155 * @param str character string contain collation elements contained in [] and
1156 *        seperated by space
1157 * @param ce array for storage, assuming size > 20
1158 * @param status error status
1159 * @return position at the end of the codepoint section
1160 */
getCEs(char * str,uint32_t * ces,UErrorCode * status)1161 static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) {
1162     char       *pStartCP     = uprv_strchr(str, '[');
1163     int         count        = 0;
1164     char       *pEndCP;
1165     char        primary[100];
1166     char        secondary[100];
1167     char        tertiary[100];
1168 
1169     while (*pStartCP == '[') {
1170         uint32_t primarycount   = 0;
1171         uint32_t secondarycount = 0;
1172         uint32_t tertiarycount  = 0;
1173         uint32_t CEi = 1;
1174         pEndCP = strchr(pStartCP, ']');
1175         if(pEndCP == NULL) {
1176             break;
1177         }
1178         pStartCP ++;
1179 
1180         primarycount   = readElement(&pStartCP, primary, ',', status);
1181         secondarycount = readElement(&pStartCP, secondary, ',', status);
1182         tertiarycount  = readElement(&pStartCP, tertiary, ']', status);
1183 
1184         /* I want to get the CEs entered right here, including continuation */
1185         ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status);
1186         if (U_FAILURE(*status)) {
1187             break;
1188         }
1189 
1190         while (2 * CEi < primarycount || CEi < secondarycount ||
1191                CEi < tertiarycount) {
1192             uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
1193             if (2 * CEi < primarycount) {
1194                 value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28);
1195                 value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24);
1196             }
1197 
1198             if (2 * CEi + 1 < primarycount) {
1199                 value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20);
1200                 value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16);
1201             }
1202 
1203             if (CEi < secondarycount) {
1204                 value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12);
1205                 value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8);
1206             }
1207 
1208             if (CEi < tertiarycount) {
1209                 value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4);
1210                 value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF);
1211             }
1212 
1213             CEi ++;
1214             ces[count ++] = value;
1215         }
1216 
1217       pStartCP = pEndCP + 1;
1218     }
1219     ces[count] = 0;
1220     return pStartCP;
1221 }
1222 
1223 /**
1224 * Getting the FractionalUCA.txt file stream
1225 */
getFractionalUCA(void)1226 static FileStream * getFractionalUCA(void)
1227 {
1228     char        newPath[256];
1229     char        backupPath[256];
1230     FileStream *result = NULL;
1231 
1232     /* Look inside ICU_DATA first */
1233     uprv_strcpy(newPath, ctest_dataSrcDir());
1234     uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING );
1235     uprv_strcat(newPath, "FractionalUCA.txt");
1236 
1237     /* As a fallback, try to guess where the source data was located
1238      *   at the time ICU was built, and look there.
1239      */
1240 #if defined (U_TOPSRCDIR)
1241     strcpy(backupPath, U_TOPSRCDIR  U_FILE_SEP_STRING "data");
1242 #else
1243     {
1244         UErrorCode errorCode = U_ZERO_ERROR;
1245         strcpy(backupPath, loadTestData(&errorCode));
1246         strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
1247     }
1248 #endif
1249     strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt");
1250 
1251     result = T_FileStream_open(newPath, "rb");
1252 
1253     if (result == NULL) {
1254         result = T_FileStream_open(backupPath, "rb");
1255         if (result == NULL) {
1256             log_err("Failed to open either %s or %s\n", newPath, backupPath);
1257         }
1258     }
1259     return result;
1260 }
1261 
1262 /**
1263 * Testing the CEs returned by the iterator
1264 */
TestCEs()1265 static void TestCEs() {
1266     FileStream *file = NULL;
1267     char        line[2048];
1268     char       *str;
1269     UChar       codepoints[10];
1270     uint32_t    ces[20];
1271     UErrorCode  status = U_ZERO_ERROR;
1272     UCollator          *coll = ucol_open("", &status);
1273     uint32_t lineNo = 0;
1274     UChar       contextCPs[5];
1275 
1276     if (U_FAILURE(status)) {
1277         log_err_status(status, "Error in opening root collator -> %s\n", u_errorName(status));
1278         return;
1279     }
1280 
1281     file = getFractionalUCA();
1282 
1283     if (file == NULL) {
1284         log_err("*** unable to open input FractionalUCA.txt file ***\n");
1285         return;
1286     }
1287 
1288 
1289     while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1290         int                 count = 0;
1291         UCollationElements *iter;
1292         int32_t            preContextCeLen=0;
1293         lineNo++;
1294         /* skip this line if it is empty or a comment or is a return value
1295         or start of some variable section */
1296         if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1297             line[0] == 0x000D || line[0] == '[') {
1298             continue;
1299         }
1300 
1301         str = getCodePoints(line, codepoints, contextCPs);
1302 
1303         /* these are 'fake' codepoints in the fractional UCA, and are used just
1304          * for positioning of indirect values. They should not go through this
1305          * test.
1306          */
1307         if(*codepoints == 0xFDD0) {
1308           continue;
1309         }
1310         if (*contextCPs != 0) {
1311             iter = ucol_openElements(coll, contextCPs, -1, &status);
1312             if (U_FAILURE(status)) {
1313                 log_err("Error in opening collation elements\n");
1314                 break;
1315             }
1316             while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) {
1317                 preContextCeLen++;
1318             }
1319             ucol_closeElements(iter);
1320         }
1321 
1322         getCEs(str, ces+preContextCeLen, &status);
1323         if (U_FAILURE(status)) {
1324             log_err("Error in parsing collation elements in FractionalUCA.txt\n");
1325             break;
1326         }
1327         iter = ucol_openElements(coll, codepoints, -1, &status);
1328         if (U_FAILURE(status)) {
1329             log_err("Error in opening collation elements\n");
1330             break;
1331         }
1332         for (;;) {
1333             uint32_t ce = (uint32_t)ucol_next(iter, &status);
1334             if (ce == 0xFFFFFFFF) {
1335                 ce = 0;
1336             }
1337             /* we now unconditionally reorder Thai/Lao prevowels, so this
1338              * test would fail if we don't skip here.
1339              */
1340             if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) {
1341               continue;
1342             }
1343             if (ce != ces[count] || U_FAILURE(status)) {
1344                 log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n");
1345                 break;
1346             }
1347             if (ces[count] == 0) {
1348                 break;
1349             }
1350             count ++;
1351         }
1352         ucol_closeElements(iter);
1353     }
1354 
1355     T_FileStream_close(file);
1356     ucol_close(coll);
1357 }
1358 
1359 /**
1360 * Testing the discontigous contractions
1361 */
TestDiscontiguos()1362 static void TestDiscontiguos() {
1363     const char               *rulestr    =
1364                             "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315";
1365           UChar               rule[50];
1366           int                 rulelen = u_unescape(rulestr, rule, 50);
1367     const char               *src[] = {
1368      "ADB", "ADBC", "A\\u0315B", "A\\u0315BC",
1369     /* base character blocked */
1370      "XD\\u0300", "XD\\u0300\\u0315",
1371     /* non blocking combining character */
1372      "X\\u0319\\u0300", "X\\u0319\\u0300\\u0315",
1373      /* blocking combining character */
1374      "X\\u0314\\u0300", "X\\u0314\\u0300\\u0315",
1375      /* contraction prefix */
1376      "ABDC", "AB\\u0315C","X\\u0300D\\u0315", "X\\u0300\\u0319\\u0315",
1377      "X\\u0300\\u031A\\u0315",
1378      /* ends not with a contraction character */
1379      "X\\u0319\\u0300D", "X\\u0319\\u0300\\u0315D", "X\\u0300D\\u0315D",
1380      "X\\u0300\\u0319\\u0315D", "X\\u0300\\u031A\\u0315D"
1381     };
1382     const char               *tgt[] = {
1383      /* non blocking combining character */
1384      "A D B", "A D BC", "A \\u0315 B", "A \\u0315 BC",
1385     /* base character blocked */
1386      "X D \\u0300", "X D \\u0300\\u0315",
1387     /* non blocking combining character */
1388      "X\\u0300 \\u0319", "X\\u0300\\u0315 \\u0319",
1389      /* blocking combining character */
1390      "X \\u0314 \\u0300", "X \\u0314 \\u0300\\u0315",
1391      /* contraction prefix */
1392      "AB DC", "AB \\u0315 C","X\\u0300 D \\u0315", "X\\u0300\\u0315 \\u0319",
1393      "X\\u0300 \\u031A \\u0315",
1394      /* ends not with a contraction character */
1395      "X\\u0300 \\u0319D", "X\\u0300\\u0315 \\u0319D", "X\\u0300 D\\u0315D",
1396      "X\\u0300\\u0315 \\u0319D", "X\\u0300 \\u031A\\u0315D"
1397     };
1398           int                 size   = 20;
1399           UCollator          *coll;
1400           UErrorCode          status    = U_ZERO_ERROR;
1401           int                 count     = 0;
1402           UCollationElements *iter;
1403           UCollationElements *resultiter;
1404 
1405     coll       = ucol_openRules(rule, rulelen, UCOL_OFF, UCOL_DEFAULT_STRENGTH,NULL, &status);
1406     iter       = ucol_openElements(coll, rule, 1, &status);
1407     resultiter = ucol_openElements(coll, rule, 1, &status);
1408 
1409     if (U_FAILURE(status)) {
1410         log_err_status(status, "Error opening collation rules -> %s\n", u_errorName(status));
1411         return;
1412     }
1413 
1414     while (count < size) {
1415         UChar  str[20];
1416         UChar  tstr[20];
1417         int    strLen = u_unescape(src[count], str, 20);
1418         UChar *s;
1419 
1420         ucol_setText(iter, str, strLen, &status);
1421         if (U_FAILURE(status)) {
1422             log_err("Error opening collation iterator\n");
1423             return;
1424         }
1425 
1426         u_unescape(tgt[count], tstr, 20);
1427         s = tstr;
1428 
1429         log_verbose("count %d\n", count);
1430 
1431         for (;;) {
1432             uint32_t  ce;
1433             UChar    *e = u_strchr(s, 0x20);
1434             if (e == 0) {
1435                 e = u_strchr(s, 0);
1436             }
1437             ucol_setText(resultiter, s, (int32_t)(e - s), &status);
1438             ce = ucol_next(resultiter, &status);
1439             if (U_FAILURE(status)) {
1440                 log_err("Error manipulating collation iterator\n");
1441                 return;
1442             }
1443             while (ce != UCOL_NULLORDER) {
1444                 if (ce != (uint32_t)ucol_next(iter, &status) ||
1445                     U_FAILURE(status)) {
1446                     log_err("Discontiguos contraction test mismatch\n");
1447                     return;
1448                 }
1449                 ce = ucol_next(resultiter, &status);
1450                 if (U_FAILURE(status)) {
1451                     log_err("Error getting next collation element\n");
1452                     return;
1453                 }
1454             }
1455             s = e + 1;
1456             if (*e == 0) {
1457                 break;
1458             }
1459         }
1460         ucol_reset(iter);
1461         backAndForth(iter);
1462         count ++;
1463     }
1464     ucol_closeElements(resultiter);
1465     ucol_closeElements(iter);
1466     ucol_close(coll);
1467 }
1468 
TestCEBufferOverflow()1469 static void TestCEBufferOverflow()
1470 {
1471     UChar               str[UCOL_EXPAND_CE_BUFFER_SIZE + 1];
1472     UErrorCode          status = U_ZERO_ERROR;
1473     UChar               rule[10];
1474     UCollator          *coll;
1475     UCollationElements *iter;
1476 
1477     u_uastrcpy(rule, "&z < AB");
1478     coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status);
1479     if (U_FAILURE(status)) {
1480         log_err_status(status, "Rule based collator not created for testing ce buffer overflow -> %s\n", u_errorName(status));
1481         return;
1482     }
1483 
1484     /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic
1485     test. this will cause an overflow in getPrev */
1486     str[0] = 0x0041;    /* 'A' */
1487     /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/
1488     uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);
1489     str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042;   /* 'B' */
1490     iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1,
1491                              &status);
1492     if (ucol_previous(iter, &status) == UCOL_NULLORDER ||
1493         status == U_BUFFER_OVERFLOW_ERROR) {
1494         log_err("CE buffer should not overflow with long string of trail surrogates\n");
1495     }
1496     ucol_closeElements(iter);
1497     ucol_close(coll);
1498 }
1499 
1500 /**
1501 * Checking collation element validity.
1502 */
1503 #define MAX_CODEPOINTS_TO_SHOW 10
showCodepoints(const UChar * codepoints,int length,char * codepointText)1504 static void showCodepoints(const UChar *codepoints, int length, char * codepointText) {
1505     int i, lengthToUse = length;
1506     if (lengthToUse > MAX_CODEPOINTS_TO_SHOW) {
1507         lengthToUse = MAX_CODEPOINTS_TO_SHOW;
1508     }
1509     for (i = 0; i < lengthToUse; ++i) {
1510         int bytesWritten = sprintf(codepointText, " %04X", *codepoints++);
1511         if (bytesWritten <= 0) {
1512             break;
1513         }
1514         codepointText += bytesWritten;
1515     }
1516     if (i < length) {
1517         sprintf(codepointText, " ...");
1518     }
1519 }
1520 
checkCEValidity(const UCollator * coll,const UChar * codepoints,int length)1521 static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
1522                              int length)
1523 {
1524     UErrorCode          status = U_ZERO_ERROR;
1525     UCollationElements *iter   = ucol_openElements(coll, codepoints, length,
1526                                                   &status);
1527     UBool result = FALSE;
1528     UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE;
1529     const char * collLocale;
1530 
1531     if (U_FAILURE(status)) {
1532         log_err("Error creating iterator for testing validity\n");
1533         return FALSE;
1534     }
1535     collLocale = ucol_getLocale(coll, ULOC_VALID_LOCALE, &status);
1536     if (U_FAILURE(status) || collLocale==NULL) {
1537         status = U_ZERO_ERROR;
1538         collLocale = "?";
1539     }
1540 
1541     for (;;) {
1542         uint32_t ce = ucol_next(iter, &status);
1543         uint32_t primary, p1, p2, secondary, tertiary;
1544         if (ce == UCOL_NULLORDER) {
1545             result = TRUE;
1546             break;
1547         }
1548         if (ce == 0) {
1549             continue;
1550         }
1551         if (ce == 0x02000202) {
1552             /* special CE for merge-sort character */
1553             if (*codepoints == 0xFFFE /* && length == 1 */) {
1554                 /*
1555                  * Note: We should check for length==1 but the token parser appears
1556                  * to give us trailing NUL characters.
1557                  * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet()
1558                  *                     rather than the internal collation rule parser
1559                  */
1560                 continue;
1561             } else {
1562                 log_err("Special 02/02/02 weight for code point U+%04X [len %d] != U+FFFE\n",
1563                         (int)*codepoints, (int)length);
1564                 break;
1565             }
1566         }
1567         primary   = UCOL_PRIMARYORDER(ce);
1568         p1 = primary >> 8;
1569         p2 = primary & 0xFF;
1570         secondary = UCOL_SECONDARYORDER(ce);
1571         tertiary  = UCOL_TERTIARYORDER(ce) & UCOL_REMOVE_CONTINUATION;
1572 
1573         if (!isContinuation(ce)) {
1574             if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
1575                 log_err("Empty CE %08lX except for case bits\n", (long)ce);
1576                 break;
1577             }
1578             if (p1 == 0) {
1579                 if (p2 != 0) {
1580                     log_err("Primary 00 xx in %08lX\n", (long)ce);
1581                     break;
1582                 }
1583                 primaryDone = TRUE;
1584             } else {
1585                 if (p1 <= 2 || p1 >= 0xF0) {
1586                     /* Primary first bytes F0..FF are specials. */
1587                     log_err("Primary first byte of %08lX out of range\n", (long)ce);
1588                     break;
1589                 }
1590                 if (p2 == 0) {
1591                     primaryDone = TRUE;
1592                 } else {
1593                     if (p2 <= 3 || p2 >= 0xFF) {
1594                         /* Primary second bytes 03 and FF are sort key compression terminators. */
1595                         log_err("Primary second byte of %08lX out of range\n", (long)ce);
1596                         break;
1597                     }
1598                     primaryDone = FALSE;
1599                 }
1600             }
1601             if (secondary == 0) {
1602                 if (primary != 0) {
1603                     log_err("Primary!=0 secondary==0 in %08lX\n", (long)ce);
1604                     break;
1605                 }
1606                 secondaryDone = TRUE;
1607             } else {
1608                 if (secondary <= 2 ||
1609                     (UCOL_BYTE_COMMON < secondary && secondary <= (UCOL_BYTE_COMMON + 0x80))
1610                 ) {
1611                     /* Secondary first bytes common+1..+0x80 are used for sort key compression. */
1612                     log_err("Secondary byte of %08lX out of range\n", (long)ce);
1613                     break;
1614                 }
1615                 secondaryDone = FALSE;
1616             }
1617             if (tertiary == 0) {
1618                 /* We know that ce != 0. */
1619                 log_err("Primary!=0 or secondary!=0 but tertiary==0 in %08lX\n", (long)ce);
1620                 break;
1621             }
1622             if (tertiary <= 2) {
1623                 log_err("Tertiary byte of %08lX out of range\n", (long)ce);
1624                 break;
1625             }
1626             tertiaryDone = FALSE;
1627         } else {
1628             if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
1629                 log_err("Empty continuation %08lX\n", (long)ce);
1630                 break;
1631             }
1632             if (primaryDone && primary != 0) {
1633                 log_err("Primary was done but continues in %08lX\n", (long)ce);
1634                 break;
1635             }
1636             if (p1 == 0) {
1637                 if (p2 != 0) {
1638                     log_err("Primary 00 xx in %08lX\n", (long)ce);
1639                     break;
1640                 }
1641                 primaryDone = TRUE;
1642             } else {
1643                 if (p1 <= 2) {
1644                     log_err("Primary first byte of %08lX out of range\n", (long)ce);
1645                     break;
1646                 }
1647                 if (p2 == 0) {
1648                     primaryDone = TRUE;
1649                 } else {
1650                     if (p2 <= 3) {
1651                         log_err("Primary second byte of %08lX out of range\n", (long)ce);
1652                         break;
1653                     }
1654                 }
1655             }
1656             if (secondaryDone && secondary != 0) {
1657                 log_err("Secondary was done but continues in %08lX\n", (long)ce);
1658                 break;
1659             }
1660             if (secondary == 0) {
1661                 secondaryDone = TRUE;
1662             } else {
1663                 if (secondary <= 2) {
1664                     log_err("Secondary byte of %08lX out of range\n", (long)ce);
1665                     break;
1666                 }
1667             }
1668             if (tertiaryDone && tertiary != 0) {
1669                 log_err("Tertiary was done but continues in %08lX\n", (long)ce);
1670                 break;
1671             }
1672             if (tertiary == 0) {
1673                 tertiaryDone = TRUE;
1674             } else if (tertiary <= 2) {
1675                 log_err("Tertiary byte of %08lX out of range\n", (long)ce);
1676                 break;
1677             }
1678         }
1679     }
1680     if (!result) {
1681         char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5];
1682         showCodepoints(codepoints, length, codepointText);
1683         log_err("Locale: %s  Code point string: %s\n", collLocale, codepointText);
1684     }
1685     ucol_closeElements(iter);
1686     return result;
1687 }
1688 
1689 static const UChar IMPORT[] = { 0x5B, 0x69, 0x6D, 0x70, 0x6F, 0x72, 0x74, 0 };  /* "[import" */
1690 
TestCEValidity()1691 static void TestCEValidity()
1692 {
1693     /* testing UCA collation elements */
1694     UErrorCode  status      = U_ZERO_ERROR;
1695     /* en_US has no tailorings */
1696     UCollator  *coll        = ucol_open("root", &status);
1697     /* tailored locales */
1698     char        locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"};
1699     const char *loc;
1700     FileStream *file = NULL;
1701     char        line[2048];
1702     UChar       codepoints[11];
1703     int         count = 0;
1704     int         maxCount = 0;
1705     UChar       contextCPs[3];
1706     UChar32     c;
1707     UParseError parseError;
1708     if (U_FAILURE(status)) {
1709         log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status));
1710         return;
1711     }
1712     log_verbose("Testing UCA elements\n");
1713     file = getFractionalUCA();
1714     if (file == NULL) {
1715         log_err("Fractional UCA data can not be opened\n");
1716         return;
1717     }
1718 
1719     while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1720         if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1721             line[0] == 0x000D || line[0] == '[') {
1722             continue;
1723         }
1724 
1725         getCodePoints(line, codepoints, contextCPs);
1726         checkCEValidity(coll, codepoints, u_strlen(codepoints));
1727     }
1728 
1729     log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1730     for (c = 0; c <= 0xffff; ++c) {
1731         if (u_isdefined(c)) {
1732             codepoints[0] = (UChar)c;
1733             checkCEValidity(coll, codepoints, 1);
1734         }
1735     }
1736     for (; c <= 0x10ffff; ++c) {
1737         if (u_isdefined(c)) {
1738             int32_t i = 0;
1739             U16_APPEND_UNSAFE(codepoints, i, c);
1740             checkCEValidity(coll, codepoints, i);
1741         }
1742     }
1743 
1744     ucol_close(coll);
1745 
1746     /* testing tailored collation elements */
1747     log_verbose("Testing tailored elements\n");
1748     if(getTestOption(QUICK_OPTION)) {
1749         maxCount = sizeof(locale)/sizeof(locale[0]);
1750     } else {
1751         maxCount = uloc_countAvailable();
1752     }
1753     while (count < maxCount) {
1754         const UChar *rules = NULL,
1755                     *current = NULL;
1756         UChar *rulesCopy = NULL;
1757         int32_t ruleLen = 0;
1758 
1759         uint32_t chOffset = 0;
1760         uint32_t chLen = 0;
1761         uint32_t exOffset = 0;
1762         uint32_t exLen = 0;
1763         uint32_t prefixOffset = 0;
1764         uint32_t prefixLen = 0;
1765         UBool    startOfRules = TRUE;
1766         UColOptionSet opts;
1767 
1768         UColTokenParser src;
1769         uint32_t strength = 0;
1770         uint16_t specs = 0;
1771         if(getTestOption(QUICK_OPTION)) {
1772             loc = locale[count];
1773         } else {
1774             loc = uloc_getAvailable(count);
1775             if(!hasCollationElements(loc)) {
1776                 count++;
1777                 continue;
1778             }
1779         }
1780         status = U_ZERO_ERROR; // clear status from previous loop iteration
1781 
1782         uprv_memset(&src, 0, sizeof(UColTokenParser));
1783 
1784         log_verbose("Testing CEs for %s\n", loc);
1785 
1786         coll      = ucol_open(loc, &status);
1787         if (U_FAILURE(status)) {
1788             log_err("%s collator creation failed with status %s\n", loc, u_errorName(status));
1789             return;
1790         }
1791 
1792         src.opts = &opts;
1793         rules = ucol_getRules(coll, &ruleLen);
1794 
1795         /*
1796          * We have not set up the UColTokenParser with a callback function
1797          * to fetch [import] sub-rules,
1798          * so skip testing tailorings that import others.
1799          * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet()
1800          *                     rather than the internal collation rule parser
1801          */
1802         if (ruleLen > 0 && u_strstr(rules, IMPORT) == NULL) {
1803             rulesCopy = (UChar *)uprv_malloc((ruleLen +
1804                 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
1805             uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
1806             src.current = src.source = rulesCopy;
1807             src.end = rulesCopy + ruleLen;
1808             src.extraCurrent = src.end;
1809             src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1810 
1811 	        /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
1812 	           the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
1813             while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL && U_SUCCESS(status)) {
1814               strength = src.parsedToken.strength;
1815               chOffset = src.parsedToken.charsOffset;
1816               chLen = src.parsedToken.charsLen;
1817               exOffset = src.parsedToken.extensionOffset;
1818               exLen = src.parsedToken.extensionLen;
1819               prefixOffset = src.parsedToken.prefixOffset;
1820               prefixLen = src.parsedToken.prefixLen;
1821               specs = src.parsedToken.flags;
1822 
1823                 startOfRules = FALSE;
1824                 uprv_memcpy(codepoints, src.source + chOffset,
1825                                                        chLen * sizeof(UChar));
1826                 codepoints[chLen] = 0;
1827                 checkCEValidity(coll, codepoints, chLen);
1828             }
1829             if (U_FAILURE(status)) {
1830                 log_err("%s collator, ucol_tok_parseNextToken failed with status %s\n", loc, u_errorName(status));
1831             }
1832             uprv_free(src.source);
1833             uprv_free(src.reorderCodes);
1834         }
1835 
1836         ucol_close(coll);
1837         count ++;
1838     }
1839     T_FileStream_close(file);
1840 }
1841 
/**
 * Reports an invalid sort key through log_err().
 * The output lists the input code units as 4-digit hex values, followed by
 * the sort key bytes as 2-digit hex values.
 *
 * @param codepoints the code units the sort key was built from
 * @param length     number of code units to print
 * @param sortkey    the sort key bytes
 * @param sklen      number of sort key bytes to print
 */
static void printSortKeyError(const UChar   *codepoints, int length,
                                    uint8_t *sortkey, int sklen)
{
    int i;

    log_err("Sortkey not valid for ");
    for (i = 0; i < length; i++) {
        log_err("0x%04x ", codepoints[i]);
    }

    log_err("\nSortkey : ");
    for (i = 0; i < sklen; i++) {
        log_err("0x%02x ", sortkey[i]);
    }
    log_err("\n");
}
1859 
1860 /**
1861 * Checking sort key validity for all levels
1862 */
checkSortKeyValidity(UCollator * coll,const UChar * codepoints,int length)1863 static UBool checkSortKeyValidity(UCollator *coll,
1864                                   const UChar *codepoints,
1865                                   int length)
1866 {
1867     UErrorCode status  = U_ZERO_ERROR;
1868     UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY,
1869                                       UCOL_TERTIARY, UCOL_QUATERNARY,
1870                                       UCOL_IDENTICAL};
1871     int        strengthlen = 5;
1872     int        strengthIndex = 0;
1873     int        caselevel   = 0;
1874 
1875     while (caselevel < 1) {
1876         if (caselevel == 0) {
1877             ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status);
1878         }
1879         else {
1880             ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status);
1881         }
1882 
1883         while (strengthIndex < strengthlen) {
1884             int        count01 = 0;
1885             uint32_t   count   = 0;
1886             uint8_t    sortkey[128];
1887             uint32_t   sklen;
1888 
1889             ucol_setStrength(coll, strength[strengthIndex]);
1890             sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128);
1891             while (sortkey[count] != 0) {
1892                 if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && strengthIndex != 4)) {
1893                     printSortKeyError(codepoints, length, sortkey, sklen);
1894                     return FALSE;
1895                 }
1896                 if (sortkey[count] == 1) {
1897                     count01 ++;
1898                 }
1899                 count ++;
1900             }
1901 
1902             if (count + 1 != sklen || (count01 != strengthIndex + caselevel)) {
1903                 printSortKeyError(codepoints, length, sortkey, sklen);
1904                 return FALSE;
1905             }
1906             strengthIndex ++;
1907         }
1908         caselevel ++;
1909     }
1910     return TRUE;
1911 }
1912 
TestSortKeyValidity(void)1913 static void TestSortKeyValidity(void)
1914 {
1915     /* testing UCA collation elements */
1916     UErrorCode  status      = U_ZERO_ERROR;
1917     /* en_US has no tailorings */
1918     UCollator  *coll        = ucol_open("en_US", &status);
1919     /* tailored locales */
1920     char        locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
1921     FileStream *file = NULL;
1922     char        line[2048];
1923     UChar       codepoints[10];
1924     int         count = 0;
1925     UChar       contextCPs[5];
1926     UParseError parseError;
1927     if (U_FAILURE(status)) {
1928         log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status));
1929         return;
1930     }
1931     log_verbose("Testing UCA elements\n");
1932     file = getFractionalUCA();
1933     if (file == NULL) {
1934         log_err("Fractional UCA data can not be opened\n");
1935         return;
1936     }
1937 
1938     while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1939         if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1940             line[0] == 0x000D || line[0] == '[') {
1941             continue;
1942         }
1943 
1944         getCodePoints(line, codepoints, contextCPs);
1945         if(codepoints[0] == 0xFFFE) {
1946             /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
1947             continue;
1948         }
1949         checkSortKeyValidity(coll, codepoints, u_strlen(codepoints));
1950     }
1951 
1952     log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1953     codepoints[0] = 0;
1954 
1955     while (codepoints[0] < 0xFFFF) {
1956         if (u_isdefined((UChar32)codepoints[0])) {
1957             checkSortKeyValidity(coll, codepoints, 1);
1958         }
1959         codepoints[0] ++;
1960     }
1961 
1962     ucol_close(coll);
1963 
1964     /* testing tailored collation elements */
1965     log_verbose("Testing tailored elements\n");
1966     while (count < 5) {
1967         const UChar *rules = NULL,
1968                     *current = NULL;
1969         UChar *rulesCopy = NULL;
1970         int32_t ruleLen = 0;
1971 
1972         uint32_t chOffset = 0;
1973         uint32_t chLen = 0;
1974         uint32_t exOffset = 0;
1975         uint32_t exLen = 0;
1976         uint32_t prefixOffset = 0;
1977         uint32_t prefixLen = 0;
1978         UBool    startOfRules = TRUE;
1979         UColOptionSet opts;
1980 
1981         UColTokenParser src;
1982         uint32_t strength = 0;
1983         uint16_t specs = 0;
1984         status = U_ZERO_ERROR; // clear status from previous loop iteration
1985 
1986         uprv_memset(&src, 0, sizeof(UColTokenParser));
1987 
1988         coll      = ucol_open(locale[count], &status);
1989         if (U_FAILURE(status)) {
1990             log_err("%s collator creation failed with status %s\n", locale[count], u_errorName(status));
1991             return;
1992         }
1993 
1994         src.opts = &opts;
1995         rules = ucol_getRules(coll, &ruleLen);
1996 
1997         /*
1998          * We have not set up the UColTokenParser with a callback function
1999          * to fetch [import] sub-rules,
2000          * so skip testing tailorings that import others.
2001          * TODO: Ticket #8047: Change TestSortKeyValidity to use ucol_getTailoredSet()
2002          *                     rather than the internal collation rule parser
2003          */
2004         if (ruleLen > 0 && u_strstr(rules, IMPORT) == NULL) {
2005             rulesCopy = (UChar *)uprv_malloc((ruleLen +
2006                 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
2007             uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
2008             src.current = src.source = rulesCopy;
2009             src.end = rulesCopy + ruleLen;
2010             src.extraCurrent = src.end;
2011             src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
2012 
2013 	        /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
2014 	           the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
2015             while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL && U_SUCCESS(status)) {
2016                 strength = src.parsedToken.strength;
2017                 chOffset = src.parsedToken.charsOffset;
2018                 chLen = src.parsedToken.charsLen;
2019                 exOffset = src.parsedToken.extensionOffset;
2020                 exLen = src.parsedToken.extensionLen;
2021                 prefixOffset = src.parsedToken.prefixOffset;
2022                 prefixLen = src.parsedToken.prefixLen;
2023                 specs = src.parsedToken.flags;
2024 
2025                 startOfRules = FALSE;
2026                 uprv_memcpy(codepoints, src.source + chOffset,
2027                                                        chLen * sizeof(UChar));
2028                 codepoints[chLen] = 0;
2029                 if(codepoints[0] == 0xFFFE) {
2030                     /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
2031                     continue;
2032                 }
2033                 checkSortKeyValidity(coll, codepoints, chLen);
2034             }
2035             if (U_FAILURE(status)) {
2036                 log_err("%s collator, ucol_tok_parseNextToken failed with status %s\n", locale[count], u_errorName(status));
2037             }
2038             uprv_free(src.source);
2039             uprv_free(src.reorderCodes);
2040         }
2041 
2042         ucol_close(coll);
2043         count ++;
2044     }
2045     T_FileStream_close(file);
2046 }
2047 
2048 /**
2049 * TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
2050 * normalization on AND jamo tailoring, among other things.
2051 */
2052 static const UChar tsceText[] = {   /* Nothing in here should be ignorable */
2053     0x0020, 0xAC00,                 /* simple LV Hangul */
2054     0x0020, 0xAC01,                 /* simple LVT Hangul */
2055     0x0020, 0xAC0F,                 /* LVTT, last jamo expands for search */
2056     0x0020, 0xAFFF,                 /* LLVVVTT, every jamo expands for search */
2057     0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */
2058     0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */
2059     0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands for search */
2060     0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for search */
2061     0x0020, 0x00E6,                 /* small letter ae, expands */
2062     0x0020, 0x1E4D,                 /* small letter o with tilde and acute, decomposes */
2063     0x0020
2064 };
2065 enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) };
2066 
2067 static const int32_t rootStandardOffsets[] = {
2068     0,  1,2,
2069     2,  3,4,4,
2070     4,  5,6,6,
2071     6,  7,8,8,
2072     8,  9,10,11,
2073     12, 13,14,15,
2074     16, 17,18,19,
2075     20, 21,22,23,
2076     24, 25,26,26,26,
2077     26, 27,28,28,
2078     28,
2079     29
2080 };
2081 enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStandardOffsets[0]) };
2082 
2083 static const int32_t rootSearchOffsets[] = {
2084     0,  1,2,
2085     2,  3,4,4,
2086     4,  5,6,6,6,
2087     6,  7,8,8,8,8,8,8,
2088     8,  9,10,11,
2089     12, 13,14,15,
2090     16, 17,18,19,20,
2091     20, 21,22,22,23,23,23,24,
2092     24, 25,26,26,26,
2093     26, 27,28,28,
2094     28,
2095     29
2096 };
2097 enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffsets[0]) };
2098 
/* One TestSearchCollatorElements case: a locale and its expected offsets. */
typedef struct {
    const char    *locale;      /* locale ID handed to ucol_open() */
    const int32_t *offsets;     /* expected ucol_getOffset() values */
    int32_t        offsetsLen;  /* number of entries in offsets */
} TSCEItem;
2104 
2105 static const TSCEItem tsceItems[] = {
2106     { "root",                  rootStandardOffsets, kLen_rootStandardOffsets },
2107     { "root@collation=search", rootSearchOffsets,   kLen_rootSearchOffsets   },
2108     { NULL,                    NULL,                0                        }
2109 };
2110 
TestSearchCollatorElements(void)2111 static void TestSearchCollatorElements(void)
2112 {
2113     const TSCEItem * tsceItemPtr;
2114     for (tsceItemPtr = tsceItems; tsceItemPtr->locale != NULL; tsceItemPtr++) {
2115         UErrorCode status = U_ZERO_ERROR;
2116         UCollator* ucol = ucol_open(tsceItemPtr->locale, &status);
2117         if ( U_SUCCESS(status) ) {
2118             UCollationElements * uce = ucol_openElements(ucol, tsceText, kLen_tsceText, &status);
2119             if ( U_SUCCESS(status) ) {
2120                 int32_t offset, element;
2121                 const int32_t * nextOffsetPtr;
2122                 const int32_t * limitOffsetPtr;
2123 
2124                 nextOffsetPtr = tsceItemPtr->offsets;
2125                 limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen;
2126                 do {
2127                     offset = ucol_getOffset(uce);
2128                     element = ucol_next(uce, &status);
2129                     if ( element == 0 ) {
2130                         log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr->locale );
2131                     }
2132                     if ( nextOffsetPtr < limitOffsetPtr ) {
2133                         if (offset != *nextOffsetPtr) {
2134                             log_err("error, locale %s, expected ucol_next -> ucol_getOffset %d, got %d\n",
2135                                                             tsceItemPtr->locale, *nextOffsetPtr, offset );
2136                             nextOffsetPtr = limitOffsetPtr;
2137                             break;
2138                         }
2139                         nextOffsetPtr++;
2140                     } else {
2141                         log_err("error, locale %s, ucol_next returned more elements than expected\n", tsceItemPtr->locale );
2142                     }
2143                 } while ( U_SUCCESS(status) && element != UCOL_NULLORDER );
2144                 if ( nextOffsetPtr < limitOffsetPtr ) {
2145                     log_err("error, locale %s, ucol_next returned fewer elements than expected\n", tsceItemPtr->locale );
2146                 }
2147 
2148                 ucol_setOffset(uce, kLen_tsceText, &status);
2149                 status = U_ZERO_ERROR;
2150                 nextOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen;
2151                 limitOffsetPtr = tsceItemPtr->offsets;
2152                 do {
2153                     offset = ucol_getOffset(uce);
2154                     element = ucol_previous(uce, &status);
2155                     if ( element == 0 ) {
2156                         log_err("error, locale %s, ucol_previous returned element 0\n", tsceItemPtr->locale );
2157                     }
2158                     if ( nextOffsetPtr > limitOffsetPtr ) {
2159                         nextOffsetPtr--;
2160                         if (offset != *nextOffsetPtr) {
2161                             log_err("error, locale %s, expected ucol_previous -> ucol_getOffset %d, got %d\n",
2162                                                                 tsceItemPtr->locale, *nextOffsetPtr, offset );
2163                             nextOffsetPtr = limitOffsetPtr;
2164                             break;
2165                         }
2166                    } else {
2167                         log_err("error, locale %s, ucol_previous returned more elements than expected\n", tsceItemPtr->locale );
2168                     }
2169                 } while ( U_SUCCESS(status) && element != UCOL_NULLORDER );
2170                 if ( nextOffsetPtr > limitOffsetPtr ) {
2171                     log_err("error, locale %s, ucol_previous returned fewer elements than expected\n", tsceItemPtr->locale );
2172                 }
2173 
2174                 ucol_closeElements(uce);
2175             } else {
2176                 log_err("error, locale %s, ucol_openElements failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
2177             }
2178             ucol_close(ucol);
2179         } else {
2180             log_data_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
2181         }
2182     }
2183 }
2184 
2185 #endif /* #if !UCONFIG_NO_COLLATION */
2186