/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1997-2010, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
/********************************************************************************
*
* File CITERTST.C
*
* Modification History:
* Date      Name               Description
*           Madhu Katragadda   Ported for C API
* 02/19/01  synwee             Modified test case for new collation iterator
*********************************************************************************/
/*
 * Collation Iterator tests.
 * (Let me reiterate my position...)
 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_COLLATION
23 
24 #include "unicode/ucol.h"
25 #include "unicode/uloc.h"
26 #include "unicode/uchar.h"
27 #include "unicode/ustring.h"
28 #include "unicode/putil.h"
29 #include "callcoll.h"
30 #include "cmemory.h"
31 #include "cintltst.h"
32 #include "citertst.h"
33 #include "ccolltst.h"
34 #include "filestrm.h"
35 #include "cstring.h"
36 #include "ucol_imp.h"
37 #include "ucol_tok.h"
38 #include "uparse.h"
39 #include <stdio.h>
40 
41 extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *);
42 
/**
 * Registers every collation element iterator test of this file in the test
 * tree, each under the "tscoll/citertst/" path prefix.
 *
 * @param root  test tree root handed through to addTest().
 */
void addCollIterTest(TestNode** root)
{
    addTest(root, TestPrevious,         "tscoll/citertst/TestPrevious");
    addTest(root, TestOffset,           "tscoll/citertst/TestOffset");
    addTest(root, TestSetText,          "tscoll/citertst/TestSetText");
    addTest(root, TestMaxExpansion,     "tscoll/citertst/TestMaxExpansion");
    addTest(root, TestUnicodeChar,      "tscoll/citertst/TestUnicodeChar");
    addTest(root, TestNormalizedUnicodeChar,
            "tscoll/citertst/TestNormalizedUnicodeChar");
    addTest(root, TestNormalization,    "tscoll/citertst/TestNormalization");
    addTest(root, TestBug672,           "tscoll/citertst/TestBug672");
    addTest(root, TestBug672Normalize,  "tscoll/citertst/TestBug672Normalize");
    addTest(root, TestSmallBuffer,      "tscoll/citertst/TestSmallBuffer");
    addTest(root, TestCEs,              "tscoll/citertst/TestCEs");
    addTest(root, TestDiscontiguos,     "tscoll/citertst/TestDiscontiguos");
    addTest(root, TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow");
    addTest(root, TestCEValidity,       "tscoll/citertst/TestCEValidity");
    addTest(root, TestSortKeyValidity,  "tscoll/citertst/TestSortKeyValidity");
}
62 
/* The locales we support: one collator is opened per entry by the
 * TestBug672* tests, which expect identical offsets for all of them. */
static const char *LOCALES[] = {
    "en_AU",
    "en_BE",
    "en_CA"
};
66 
TestBug672()67 static void TestBug672() {
68     UErrorCode  status = U_ZERO_ERROR;
69     UChar       pattern[20];
70     UChar       text[50];
71     int         i;
72     int         result[3][3];
73 
74     u_uastrcpy(pattern, "resume");
75     u_uastrcpy(text, "Time to resume updating my resume.");
76 
77     for (i = 0; i < 3; ++ i) {
78         UCollator          *coll = ucol_open(LOCALES[i], &status);
79         UCollationElements *pitr = ucol_openElements(coll, pattern, -1,
80                                                      &status);
81         UCollationElements *titer = ucol_openElements(coll, text, -1,
82                                                      &status);
83         if (U_FAILURE(status)) {
84             log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n",
85                     myErrorName(status));
86             return;
87         }
88 
89         log_verbose("locale tested %s\n", LOCALES[i]);
90 
91         while (ucol_next(pitr, &status) != UCOL_NULLORDER &&
92                U_SUCCESS(status)) {
93         }
94         if (U_FAILURE(status)) {
95             log_err("ERROR: reversing collation iterator :%s\n",
96                     myErrorName(status));
97             return;
98         }
99         ucol_reset(pitr);
100 
101         ucol_setOffset(titer, u_strlen(pattern), &status);
102         if (U_FAILURE(status)) {
103             log_err("ERROR: setting offset in collator :%s\n",
104                     myErrorName(status));
105             return;
106         }
107         result[i][0] = ucol_getOffset(titer);
108         log_verbose("Text iterator set to offset %d\n", result[i][0]);
109 
110         /* Use previous() */
111         ucol_previous(titer, &status);
112         result[i][1] = ucol_getOffset(titer);
113         log_verbose("Current offset %d after previous\n", result[i][1]);
114 
115         /* Add one to index */
116         log_verbose("Adding one to current offset...\n");
117         ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status);
118         if (U_FAILURE(status)) {
119             log_err("ERROR: setting offset in collator :%s\n",
120                     myErrorName(status));
121             return;
122         }
123         result[i][2] = ucol_getOffset(titer);
124         log_verbose("Current offset in text = %d\n", result[i][2]);
125         ucol_closeElements(pitr);
126         ucol_closeElements(titer);
127         ucol_close(coll);
128     }
129 
130     if (uprv_memcmp(result[0], result[1], 3) != 0 ||
131         uprv_memcmp(result[1], result[2], 3) != 0) {
132         log_err("ERROR: Different locales have different offsets at the same character\n");
133     }
134 }
135 
136 
137 
138 /*  Running this test with normalization enabled showed up a bug in the incremental
139     normalization code. */
TestBug672Normalize()140 static void TestBug672Normalize() {
141     UErrorCode  status = U_ZERO_ERROR;
142     UChar       pattern[20];
143     UChar       text[50];
144     int         i;
145     int         result[3][3];
146 
147     u_uastrcpy(pattern, "resume");
148     u_uastrcpy(text, "Time to resume updating my resume.");
149 
150     for (i = 0; i < 3; ++ i) {
151         UCollator          *coll = ucol_open(LOCALES[i], &status);
152         UCollationElements *pitr = NULL;
153         UCollationElements *titer = NULL;
154 
155         ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
156 
157         pitr = ucol_openElements(coll, pattern, -1, &status);
158         titer = ucol_openElements(coll, text, -1, &status);
159         if (U_FAILURE(status)) {
160             log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n",
161                     myErrorName(status));
162             return;
163         }
164 
165         log_verbose("locale tested %s\n", LOCALES[i]);
166 
167         while (ucol_next(pitr, &status) != UCOL_NULLORDER &&
168                U_SUCCESS(status)) {
169         }
170         if (U_FAILURE(status)) {
171             log_err("ERROR: reversing collation iterator :%s\n",
172                     myErrorName(status));
173             return;
174         }
175         ucol_reset(pitr);
176 
177         ucol_setOffset(titer, u_strlen(pattern), &status);
178         if (U_FAILURE(status)) {
179             log_err("ERROR: setting offset in collator :%s\n",
180                     myErrorName(status));
181             return;
182         }
183         result[i][0] = ucol_getOffset(titer);
184         log_verbose("Text iterator set to offset %d\n", result[i][0]);
185 
186         /* Use previous() */
187         ucol_previous(titer, &status);
188         result[i][1] = ucol_getOffset(titer);
189         log_verbose("Current offset %d after previous\n", result[i][1]);
190 
191         /* Add one to index */
192         log_verbose("Adding one to current offset...\n");
193         ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status);
194         if (U_FAILURE(status)) {
195             log_err("ERROR: setting offset in collator :%s\n",
196                     myErrorName(status));
197             return;
198         }
199         result[i][2] = ucol_getOffset(titer);
200         log_verbose("Current offset in text = %d\n", result[i][2]);
201         ucol_closeElements(pitr);
202         ucol_closeElements(titer);
203         ucol_close(coll);
204     }
205 
206     if (uprv_memcmp(result[0], result[1], 3) != 0 ||
207         uprv_memcmp(result[1], result[2], 3) != 0) {
208         log_err("ERROR: Different locales have different offsets at the same character\n");
209     }
210 }
211 
212 
213 
214 
215 /**
216  * Test for CollationElementIterator previous and next for the whole set of
217  * unicode characters.
218  */
TestUnicodeChar()219 static void TestUnicodeChar()
220 {
221     UChar source[0x100];
222     UCollator *en_us;
223     UCollationElements *iter;
224     UErrorCode status = U_ZERO_ERROR;
225     UChar codepoint;
226 
227     UChar *test;
228     en_us = ucol_open("en_US", &status);
229     if (U_FAILURE(status)){
230        log_err_status(status, "ERROR: in creation of collation data using ucol_open()\n %s\n",
231               myErrorName(status));
232        return;
233     }
234 
235     for (codepoint = 1; codepoint < 0xFFFE;)
236     {
237       test = source;
238 
239       while (codepoint % 0xFF != 0)
240       {
241         if (u_isdefined(codepoint))
242           *(test ++) = codepoint;
243         codepoint ++;
244       }
245 
246       if (u_isdefined(codepoint))
247         *(test ++) = codepoint;
248 
249       if (codepoint != 0xFFFF)
250         codepoint ++;
251 
252       *test = 0;
253       iter=ucol_openElements(en_us, source, u_strlen(source), &status);
254       if(U_FAILURE(status)){
255           log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
256               myErrorName(status));
257           ucol_close(en_us);
258           return;
259       }
260       /* A basic test to see if it's working at all */
261       log_verbose("codepoint testing %x\n", codepoint);
262       backAndForth(iter);
263       ucol_closeElements(iter);
264 
265       /* null termination test */
266       iter=ucol_openElements(en_us, source, -1, &status);
267       if(U_FAILURE(status)){
268           log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
269               myErrorName(status));
270           ucol_close(en_us);
271           return;
272       }
273       /* A basic test to see if it's working at all */
274       backAndForth(iter);
275       ucol_closeElements(iter);
276     }
277 
278     ucol_close(en_us);
279 }
280 
281 /**
282  * Test for CollationElementIterator previous and next for the whole set of
283  * unicode characters with normalization on.
284  */
TestNormalizedUnicodeChar()285 static void TestNormalizedUnicodeChar()
286 {
287     UChar source[0x100];
288     UCollator *th_th;
289     UCollationElements *iter;
290     UErrorCode status = U_ZERO_ERROR;
291     UChar codepoint;
292 
293     UChar *test;
294     /* thai should have normalization on */
295     th_th = ucol_open("th_TH", &status);
296     if (U_FAILURE(status)){
297         log_err_status(status, "ERROR: in creation of thai collation using ucol_open()\n %s\n",
298               myErrorName(status));
299         return;
300     }
301 
302     for (codepoint = 1; codepoint < 0xFFFE;)
303     {
304       test = source;
305 
306       while (codepoint % 0xFF != 0)
307       {
308         if (u_isdefined(codepoint))
309           *(test ++) = codepoint;
310         codepoint ++;
311       }
312 
313       if (u_isdefined(codepoint))
314         *(test ++) = codepoint;
315 
316       if (codepoint != 0xFFFF)
317         codepoint ++;
318 
319       *test = 0;
320       iter=ucol_openElements(th_th, source, u_strlen(source), &status);
321       if(U_FAILURE(status)){
322           log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
323               myErrorName(status));
324             ucol_close(th_th);
325           return;
326       }
327 
328       backAndForth(iter);
329       ucol_closeElements(iter);
330 
331       iter=ucol_openElements(th_th, source, -1, &status);
332       if(U_FAILURE(status)){
333           log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
334               myErrorName(status));
335             ucol_close(th_th);
336           return;
337       }
338 
339       backAndForth(iter);
340       ucol_closeElements(iter);
341     }
342 
343     ucol_close(th_th);
344 }
345 
346 /**
347 * Test the incremental normalization
348 */
TestNormalization()349 static void TestNormalization()
350 {
351           UErrorCode          status = U_ZERO_ERROR;
352     const char               *str    =
353                             "&a < \\u0300\\u0315 < A\\u0300\\u0315 < \\u0316\\u0315B < \\u0316\\u0300\\u0315";
354           UCollator          *coll;
355           UChar               rule[50];
356           int                 rulelen = u_unescape(str, rule, 50);
357           int                 count = 0;
358     const char                *testdata[] =
359                         {"\\u1ED9", "o\\u0323\\u0302",
360                         "\\u0300\\u0315", "\\u0315\\u0300",
361                         "A\\u0300\\u0315B", "A\\u0315\\u0300B",
362                         "A\\u0316\\u0315B", "A\\u0315\\u0316B",
363                         "\\u0316\\u0300\\u0315", "\\u0315\\u0300\\u0316",
364                         "A\\u0316\\u0300\\u0315B", "A\\u0315\\u0300\\u0316B",
365                         "\\u0316\\u0315\\u0300", "A\\u0316\\u0315\\u0300B"};
366     int32_t   srclen;
367     UChar source[10];
368     UCollationElements *iter;
369 
370     coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status);
371     ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
372     if (U_FAILURE(status)){
373         log_err_status(status, "ERROR: in creation of collator using ucol_openRules()\n %s\n",
374               myErrorName(status));
375         return;
376     }
377 
378     srclen = u_unescape(testdata[0], source, 10);
379     iter = ucol_openElements(coll, source, srclen, &status);
380     backAndForth(iter);
381     ucol_closeElements(iter);
382 
383     srclen = u_unescape(testdata[1], source, 10);
384     iter = ucol_openElements(coll, source, srclen, &status);
385     backAndForth(iter);
386     ucol_closeElements(iter);
387 
388     while (count < 12) {
389         srclen = u_unescape(testdata[count], source, 10);
390         iter = ucol_openElements(coll, source, srclen, &status);
391 
392         if (U_FAILURE(status)){
393             log_err("ERROR: in creation of collator element iterator\n %s\n",
394                   myErrorName(status));
395             return;
396         }
397         backAndForth(iter);
398         ucol_closeElements(iter);
399 
400         iter = ucol_openElements(coll, source, -1, &status);
401 
402         if (U_FAILURE(status)){
403             log_err("ERROR: in creation of collator element iterator\n %s\n",
404                   myErrorName(status));
405             return;
406         }
407         backAndForth(iter);
408         ucol_closeElements(iter);
409         count ++;
410     }
411     ucol_close(coll);
412 }
413 
414 /**
415  * Test for CollationElementIterator.previous()
416  *
417  * @bug 4108758 - Make sure it works with contracting characters
418  *
419  */
TestPrevious()420 static void TestPrevious()
421 {
422     UCollator *coll=NULL;
423     UChar rule[50];
424     UChar *source;
425     UCollator *c1, *c2, *c3;
426     UCollationElements *iter;
427     UErrorCode status = U_ZERO_ERROR;
428     UChar test1[50];
429     UChar test2[50];
430 
431     u_uastrcpy(test1, "What subset of all possible test cases?");
432     u_uastrcpy(test2, "has the highest probability of detecting");
433     coll = ucol_open("en_US", &status);
434 
435     iter=ucol_openElements(coll, test1, u_strlen(test1), &status);
436     log_verbose("English locale testing back and forth\n");
437     if(U_FAILURE(status)){
438         log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
439             myErrorName(status));
440         ucol_close(coll);
441         return;
442     }
443     /* A basic test to see if it's working at all */
444     backAndForth(iter);
445     ucol_closeElements(iter);
446     ucol_close(coll);
447 
448     /* Test with a contracting character sequence */
449     u_uastrcpy(rule, "&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH");
450     c1 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status);
451 
452     log_verbose("Contraction rule testing back and forth with no normalization\n");
453 
454     if (c1 == NULL || U_FAILURE(status))
455     {
456         log_err("Couldn't create a RuleBasedCollator with a contracting sequence\n %s\n",
457             myErrorName(status));
458         return;
459     }
460     source=(UChar*)malloc(sizeof(UChar) * 20);
461     u_uastrcpy(source, "abchdcba");
462     iter=ucol_openElements(c1, source, u_strlen(source), &status);
463     if(U_FAILURE(status)){
464         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
465             myErrorName(status));
466         return;
467     }
468     backAndForth(iter);
469     ucol_closeElements(iter);
470     ucol_close(c1);
471 
472     /* Test with an expanding character sequence */
473     u_uastrcpy(rule, "&a < b < c/abd < d");
474     c2 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status);
475     log_verbose("Expansion rule testing back and forth with no normalization\n");
476     if (c2 == NULL || U_FAILURE(status))
477     {
478         log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
479             myErrorName(status));
480         return;
481     }
482     u_uastrcpy(source, "abcd");
483     iter=ucol_openElements(c2, source, u_strlen(source), &status);
484     if(U_FAILURE(status)){
485         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
486             myErrorName(status));
487         return;
488     }
489     backAndForth(iter);
490     ucol_closeElements(iter);
491     ucol_close(c2);
492     /* Now try both */
493     u_uastrcpy(rule, "&a < b < c/aba < d < z < ch");
494     c3 = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,  UCOL_DEFAULT_STRENGTH,NULL, &status);
495     log_verbose("Expansion/contraction rule testing back and forth with no normalization\n");
496 
497     if (c3 == NULL || U_FAILURE(status))
498     {
499         log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n",
500             myErrorName(status));
501         return;
502     }
503     u_uastrcpy(source, "abcdbchdc");
504     iter=ucol_openElements(c3, source, u_strlen(source), &status);
505     if(U_FAILURE(status)){
506         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
507             myErrorName(status));
508         return;
509     }
510     backAndForth(iter);
511     ucol_closeElements(iter);
512     ucol_close(c3);
513     source[0] = 0x0e41;
514     source[1] = 0x0e02;
515     source[2] = 0x0e41;
516     source[3] = 0x0e02;
517     source[4] = 0x0e27;
518     source[5] = 0x61;
519     source[6] = 0x62;
520     source[7] = 0x63;
521     source[8] = 0;
522 
523     coll = ucol_open("th_TH", &status);
524     log_verbose("Thai locale testing back and forth with normalization\n");
525     iter=ucol_openElements(coll, source, u_strlen(source), &status);
526     if(U_FAILURE(status)){
527         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
528             myErrorName(status));
529         return;
530     }
531     backAndForth(iter);
532     ucol_closeElements(iter);
533     ucol_close(coll);
534 
535     /* prev test */
536     source[0] = 0x0061;
537     source[1] = 0x30CF;
538     source[2] = 0x3099;
539     source[3] = 0x30FC;
540     source[4] = 0;
541 
542     coll = ucol_open("ja_JP", &status);
543     log_verbose("Japanese locale testing back and forth with normalization\n");
544     iter=ucol_openElements(coll, source, u_strlen(source), &status);
545     if(U_FAILURE(status)){
546         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
547             myErrorName(status));
548         return;
549     }
550     backAndForth(iter);
551     ucol_closeElements(iter);
552     ucol_close(coll);
553 
554     free(source);
555 }
556 
557 /**
558  * Test for getOffset() and setOffset()
559  */
TestOffset()560 static void TestOffset()
561 {
562     UErrorCode status= U_ZERO_ERROR;
563     UCollator *en_us=NULL;
564     UCollationElements *iter, *pristine;
565     int32_t offset;
566     OrderAndOffset *orders;
567     int32_t orderLength=0;
568     int     count = 0;
569     UChar test1[50];
570     UChar test2[50];
571 
572     u_uastrcpy(test1, "What subset of all possible test cases?");
573     u_uastrcpy(test2, "has the highest probability of detecting");
574     en_us = ucol_open("en_US", &status);
575     log_verbose("Testing getOffset and setOffset for collations\n");
576     iter = ucol_openElements(en_us, test1, u_strlen(test1), &status);
577     if(U_FAILURE(status)){
578         log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
579             myErrorName(status));
580         ucol_close(en_us);
581         return;
582     }
583 
584     /* testing boundaries */
585     ucol_setOffset(iter, 0, &status);
586     if (U_FAILURE(status) || ucol_previous(iter, &status) != UCOL_NULLORDER) {
587         log_err("Error: After setting offset to 0, we should be at the end "
588                 "of the backwards iteration");
589     }
590     ucol_setOffset(iter, u_strlen(test1), &status);
591     if (U_FAILURE(status) || ucol_next(iter, &status) != UCOL_NULLORDER) {
592         log_err("Error: After setting offset to end of the string, we should "
593                 "be at the end of the backwards iteration");
594     }
595 
596     /* Run all the way through the iterator, then get the offset */
597 
598     orders = getOrders(iter, &orderLength);
599 
600     offset = ucol_getOffset(iter);
601 
602     if (offset != u_strlen(test1))
603     {
604         log_err("offset at end != length %d vs %d\n", offset,
605             u_strlen(test1) );
606     }
607 
608     /* Now set the offset back to the beginning and see if it works */
609     pristine=ucol_openElements(en_us, test1, u_strlen(test1), &status);
610     if(U_FAILURE(status)){
611         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
612             myErrorName(status));
613     ucol_close(en_us);
614         return;
615     }
616     status = U_ZERO_ERROR;
617 
618     ucol_setOffset(iter, 0, &status);
619     if (U_FAILURE(status))
620     {
621         log_err("setOffset failed. %s\n",    myErrorName(status));
622     }
623     else
624     {
625         assertEqual(iter, pristine);
626     }
627 
628     ucol_closeElements(pristine);
629     ucol_closeElements(iter);
630     free(orders);
631 
632     /* testing offsets in normalization buffer */
633     test1[0] = 0x61;
634     test1[1] = 0x300;
635     test1[2] = 0x316;
636     test1[3] = 0x62;
637     test1[4] = 0;
638     ucol_setAttribute(en_us, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
639     iter = ucol_openElements(en_us, test1, 4, &status);
640     if(U_FAILURE(status)){
641         log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
642             myErrorName(status));
643         ucol_close(en_us);
644         return;
645     }
646 
647     count = 0;
648     while (ucol_next(iter, &status) != UCOL_NULLORDER &&
649         U_SUCCESS(status)) {
650         switch (count) {
651         case 0:
652             if (ucol_getOffset(iter) != 1) {
653                 log_err("ERROR: Offset of iteration should be 1\n");
654             }
655             break;
656         case 3:
657             if (ucol_getOffset(iter) != 4) {
658                 log_err("ERROR: Offset of iteration should be 4\n");
659             }
660             break;
661         default:
662             if (ucol_getOffset(iter) != 3) {
663                 log_err("ERROR: Offset of iteration should be 3\n");
664             }
665         }
666         count ++;
667     }
668 
669     ucol_reset(iter);
670     count = 0;
671     while (ucol_previous(iter, &status) != UCOL_NULLORDER &&
672         U_SUCCESS(status)) {
673         switch (count) {
674         case 0:
675         case 1:
676             if (ucol_getOffset(iter) != 3) {
677                 log_err("ERROR: Offset of iteration should be 3\n");
678             }
679             break;
680         case 2:
681             if (ucol_getOffset(iter) != 1) {
682                 log_err("ERROR: Offset of iteration should be 1\n");
683             }
684             break;
685         default:
686             if (ucol_getOffset(iter) != 0) {
687                 log_err("ERROR: Offset of iteration should be 0\n");
688             }
689         }
690         count ++;
691     }
692 
693     if(U_FAILURE(status)){
694         log_err("ERROR: in iterating collation elements %s\n",
695             myErrorName(status));
696     }
697 
698     ucol_closeElements(iter);
699     ucol_close(en_us);
700 }
701 
702 /**
703  * Test for setText()
704  */
TestSetText()705 static void TestSetText()
706 {
707     int32_t c,i;
708     UErrorCode status = U_ZERO_ERROR;
709     UCollator *en_us=NULL;
710     UCollationElements *iter1, *iter2;
711     UChar test1[50];
712     UChar test2[50];
713 
714     u_uastrcpy(test1, "What subset of all possible test cases?");
715     u_uastrcpy(test2, "has the highest probability of detecting");
716     en_us = ucol_open("en_US", &status);
717     log_verbose("testing setText for Collation elements\n");
718     iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status);
719     if(U_FAILURE(status)){
720         log_err_status(status, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n",
721             myErrorName(status));
722     ucol_close(en_us);
723         return;
724     }
725     iter2=ucol_openElements(en_us, test2, u_strlen(test2), &status);
726     if(U_FAILURE(status)){
727         log_err("ERROR: in creation of collation element iterator2 using ucol_openElements()\n %s\n",
728             myErrorName(status));
729     ucol_close(en_us);
730         return;
731     }
732 
733     /* Run through the second iterator just to exercise it */
734     c = ucol_next(iter2, &status);
735     i = 0;
736 
737     while ( ++i < 10 && (c != UCOL_NULLORDER))
738     {
739         if (U_FAILURE(status))
740         {
741             log_err("iter2->next() returned an error. %s\n", myErrorName(status));
742             ucol_closeElements(iter2);
743             ucol_closeElements(iter1);
744     ucol_close(en_us);
745             return;
746         }
747 
748         c = ucol_next(iter2, &status);
749     }
750 
751     /* Now set it to point to the same string as the first iterator */
752     ucol_setText(iter2, test1, u_strlen(test1), &status);
753     if (U_FAILURE(status))
754     {
755         log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status));
756     }
757     else
758     {
759         assertEqual(iter1, iter2);
760     }
761 
762     /* Now set it to point to a null string with fake length*/
763     ucol_setText(iter2, NULL, 2, &status);
764     if (U_FAILURE(status))
765     {
766         log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status));
767     }
768     else
769     {
770         if (ucol_next(iter2, &status) != UCOL_NULLORDER) {
771             log_err("iter2 with null text expected to return UCOL_NULLORDER\n");
772         }
773     }
774 
775     ucol_closeElements(iter2);
776     ucol_closeElements(iter1);
777     ucol_close(en_us);
778 }
779 
780 /** @bug 4108762
781  * Test for getMaxExpansion()
782  */
TestMaxExpansion()783 static void TestMaxExpansion()
784 {
785     UErrorCode          status = U_ZERO_ERROR;
786     UCollator          *coll   ;/*= ucol_open("en_US", &status);*/
787     UChar               ch     = 0;
788     UChar32             unassigned = 0xEFFFD;
789     UChar               supplementary[2];
790     uint32_t            stringOffset = 0;
791     UBool               isError = FALSE;
792     uint32_t            sorder = 0;
793     UCollationElements *iter   ;/*= ucol_openElements(coll, &ch, 1, &status);*/
794     uint32_t            temporder = 0;
795 
796     UChar rule[256];
797     u_uastrcpy(rule, "&a < ab < c/aba < d < z < ch");
798     coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,
799         UCOL_DEFAULT_STRENGTH,NULL, &status);
800     if(U_SUCCESS(status) && coll) {
801       iter = ucol_openElements(coll, &ch, 1, &status);
802 
803       while (ch < 0xFFFF && U_SUCCESS(status)) {
804           int      count = 1;
805           uint32_t order;
806           int32_t  size = 0;
807 
808           ch ++;
809 
810           ucol_setText(iter, &ch, 1, &status);
811           order = ucol_previous(iter, &status);
812 
813           /* thai management */
814           if (order == 0)
815               order = ucol_previous(iter, &status);
816 
817           while (U_SUCCESS(status) &&
818               ucol_previous(iter, &status) != UCOL_NULLORDER) {
819               count ++;
820           }
821 
822           size = ucol_getMaxExpansion(iter, order);
823           if (U_FAILURE(status) || size < count) {
824               log_err("Failure at codepoint %d, maximum expansion count < %d\n",
825                   ch, count);
826           }
827       }
828 
829       /* testing for exact max expansion */
830       ch = 0;
831       while (ch < 0x61) {
832           uint32_t order;
833           int32_t  size;
834           ucol_setText(iter, &ch, 1, &status);
835           order = ucol_previous(iter, &status);
836           size  = ucol_getMaxExpansion(iter, order);
837           if (U_FAILURE(status) || size != 1) {
838               log_err("Failure at codepoint %d, maximum expansion count < %d\n",
839                   ch, 1);
840           }
841           ch ++;
842       }
843 
844       ch = 0x63;
845       ucol_setText(iter, &ch, 1, &status);
846       temporder = ucol_previous(iter, &status);
847 
848       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 3) {
849           log_err("Failure at codepoint %d, maximum expansion count != %d\n",
850                   ch, 3);
851       }
852 
853       ch = 0x64;
854       ucol_setText(iter, &ch, 1, &status);
855       temporder = ucol_previous(iter, &status);
856 
857       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 1) {
858           log_err("Failure at codepoint %d, maximum expansion count != %d\n",
859                   ch, 3);
860       }
861 
862       U16_APPEND(supplementary, stringOffset, 2, unassigned, isError);
863       ucol_setText(iter, supplementary, 2, &status);
864       sorder = ucol_previous(iter, &status);
865 
866       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, sorder) != 2) {
867           log_err("Failure at codepoint %d, maximum expansion count < %d\n",
868                   ch, 2);
869       }
870 
871       /* testing jamo */
872       ch = 0x1165;
873 
874       ucol_setText(iter, &ch, 1, &status);
875       temporder = ucol_previous(iter, &status);
876       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) > 3) {
877           log_err("Failure at codepoint %d, maximum expansion count > %d\n",
878                   ch, 3);
879       }
880 
881       ucol_closeElements(iter);
882       ucol_close(coll);
883 
884       /* testing special jamo &a<\u1160 */
885       rule[0] = 0x26;
886       rule[1] = 0x71;
887       rule[2] = 0x3c;
888       rule[3] = 0x1165;
889       rule[4] = 0x2f;
890       rule[5] = 0x71;
891       rule[6] = 0x71;
892       rule[7] = 0x71;
893       rule[8] = 0x71;
894       rule[9] = 0;
895 
896       coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,
897           UCOL_DEFAULT_STRENGTH,NULL, &status);
898       iter = ucol_openElements(coll, &ch, 1, &status);
899 
900       temporder = ucol_previous(iter, &status);
901       if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 6) {
902           log_err("Failure at codepoint %d, maximum expansion count > %d\n",
903                   ch, 5);
904       }
905 
906       ucol_closeElements(iter);
907       ucol_close(coll);
908     } else {
909       log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status));
910     }
911 
912 }
913 
914 
/**
* Walks two collation element iterators forward in lockstep and reports the
* first position at which they return different collation elements.
* Stops silently once both iterators return UCOL_NULLORDER together.
* @param i1 first iterator, advanced with ucol_next
* @param i2 second iterator, advanced with ucol_next
*/
static void assertEqual(UCollationElements *i1, UCollationElements *i2)
{
    UErrorCode errorCode = U_ZERO_ERROR;
    int32_t    step;

    for (step = 0; ; ++step) {
        const int32_t left  = ucol_next(i1, &errorCode);
        const int32_t right = ucol_next(i2, &errorCode);

        if (left != right) {
            log_err("Error in iteration %d assetEqual between\n  %d  and   %d, they are not equal\n", step, left, right);
            return;
        }
        if (left == UCOL_NULLORDER) {
            /* both iterators are exhausted at the same step */
            return;
        }
    }
}
936 
937 /**
938  * Testing iterators with extremely small buffers
939  */
TestSmallBuffer()940 static void TestSmallBuffer()
941 {
942     UErrorCode          status = U_ZERO_ERROR;
943     UCollator          *coll;
944     UCollationElements *testiter,
945                        *iter;
946     int32_t             count = 0;
947     OrderAndOffset     *testorders,
948                        *orders;
949 
950     UChar teststr[500];
951     UChar str[] = {0x300, 0x31A, 0};
952     /*
953     creating a long string of decomposable characters,
954     since by default the writable buffer is of size 256
955     */
956     while (count < 500) {
957         if ((count & 1) == 0) {
958             teststr[count ++] = 0x300;
959         }
960         else {
961             teststr[count ++] = 0x31A;
962         }
963     }
964 
965     coll = ucol_open("th_TH", &status);
966     if(U_SUCCESS(status) && coll) {
967       testiter = ucol_openElements(coll, teststr, 500, &status);
968       iter = ucol_openElements(coll, str, 2, &status);
969 
970       orders     = getOrders(iter, &count);
971       if (count != 2) {
972           log_err("Error collation elements size is not 2 for \\u0300\\u031A\n");
973       }
974 
975       /*
976       this will rearrange the string data to 250 characters of 0x300 first then
977       250 characters of 0x031A
978       */
979       testorders = getOrders(testiter, &count);
980 
981       if (count != 500) {
982           log_err("Error decomposition does not give the right sized collation elements\n");
983       }
984 
985       while (count != 0) {
986           /* UCA collation element for 0x0F76 */
987           if ((count > 250 && testorders[-- count].order != orders[1].order) ||
988               (count <= 250 && testorders[-- count].order != orders[0].order)) {
989               log_err("Error decomposition does not give the right collation element at %d count\n", count);
990               break;
991           }
992       }
993 
994       free(testorders);
995       free(orders);
996 
997       ucol_reset(testiter);
998 
999       /* ensures closing of elements done properly to clear writable buffer */
1000       ucol_next(testiter, &status);
1001       ucol_next(testiter, &status);
1002       ucol_closeElements(testiter);
1003       ucol_closeElements(iter);
1004       ucol_close(coll);
1005     } else {
1006       log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status));
1007     }
1008 }
1009 
/**
* Converts one hexadecimal digit character to its numeric value
* (snippet of code from genuca).
* @param hex character to convert, '0'-'9', 'a'-'f' or 'A'-'F'
* @return value 0..15; any other character yields 0
*/
static int32_t hex2num(char hex) {
    if ('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if ('a' <= hex && hex <= 'f') {
        return 10 + (hex - 'a');
    }
    if ('A' <= hex && hex <= 'F') {
        return 10 + (hex - 'A');
    }
    return 0;
}
1024 
1025 /**
1026 * Getting codepoints from a string
1027 * @param str character string contain codepoints seperated by space and ended
1028 *        by a semicolon
1029 * @param codepoints array for storage, assuming size > 5
1030 * @return position at the end of the codepoint section
1031 */
getCodePoints(char * str,UChar * codepoints,UChar * contextCPs)1032 static char *getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) {
1033     UErrorCode errorCode = U_ZERO_ERROR;
1034     char *semi = uprv_strchr(str, ';');
1035     char *pipe = uprv_strchr(str, '|');
1036     char *s;
1037     *codepoints = 0;
1038     *contextCPs = 0;
1039     if(semi == NULL) {
1040         log_err("expected semicolon after code point string in FractionalUCA.txt %s\n", str);
1041         return str;
1042     }
1043     if(pipe != NULL) {
1044         int32_t contextLength;
1045         *pipe = 0;
1046         contextLength = u_parseString(str, contextCPs, 99, NULL, &errorCode);
1047         *pipe = '|';
1048         if(U_FAILURE(errorCode)) {
1049             log_err("error parsing precontext string from FractionalUCA.txt %s\n", str);
1050             return str;
1051         }
1052         /* prepend the precontext string to the codepoints */
1053         u_memcpy(codepoints, contextCPs, contextLength);
1054         codepoints += contextLength;
1055         /* start of the code point string */
1056         s = pipe + 1;
1057     } else {
1058         s = str;
1059     }
1060     u_parseString(s, codepoints, 99, NULL, &errorCode);
1061     if(U_FAILURE(errorCode)) {
1062         log_err("error parsing code point string from FractionalUCA.txt %s\n", str);
1063         return str;
1064     }
1065     return semi + 1;
1066 }
1067 
1068 /**
1069 * Sniplets of code from genuca
1070 */
1071 static int32_t
readElement(char ** from,char * to,char separator,UErrorCode * status)1072 readElement(char **from, char *to, char separator, UErrorCode *status)
1073 {
1074     if (U_SUCCESS(*status)) {
1075         char    buffer[1024];
1076         int32_t i = 0;
1077         while (**from != separator) {
1078             if (**from != ' ') {
1079                 *(buffer+i++) = **from;
1080             }
1081             (*from)++;
1082         }
1083         (*from)++;
1084         *(buffer + i) = 0;
1085         strcpy(to, buffer);
1086         return i/2;
1087     }
1088 
1089     return 0;
1090 }
1091 
1092 /**
1093 * Sniplets of code from genuca
1094 */
1095 static uint32_t
getSingleCEValue(char * primary,char * secondary,char * tertiary,UErrorCode * status)1096 getSingleCEValue(char *primary, char *secondary, char *tertiary,
1097                           UErrorCode *status)
1098 {
1099     if (U_SUCCESS(*status)) {
1100         uint32_t  value    = 0;
1101         char      primsave = '\0';
1102         char      secsave  = '\0';
1103         char      tersave  = '\0';
1104         char     *primend  = primary+4;
1105         char     *secend   = secondary+2;
1106         char     *terend   = tertiary+2;
1107         uint32_t  primvalue;
1108         uint32_t  secvalue;
1109         uint32_t  tervalue;
1110 
1111         if (uprv_strlen(primary) > 4) {
1112             primsave = *primend;
1113             *primend = '\0';
1114         }
1115 
1116         if (uprv_strlen(secondary) > 2) {
1117             secsave = *secend;
1118             *secend = '\0';
1119         }
1120 
1121         if (uprv_strlen(tertiary) > 2) {
1122             tersave = *terend;
1123             *terend = '\0';
1124         }
1125 
1126         primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0;
1127         secvalue  = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0;
1128         tervalue  = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0;
1129         if(primvalue <= 0xFF) {
1130           primvalue <<= 8;
1131         }
1132 
1133         value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK)
1134            | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK)
1135            | (tervalue & UCOL_TERTIARYORDERMASK);
1136 
1137         if(primsave!='\0') {
1138             *primend = primsave;
1139         }
1140         if(secsave!='\0') {
1141             *secend = secsave;
1142         }
1143         if(tersave!='\0') {
1144             *terend = tersave;
1145         }
1146         return value;
1147     }
1148     return 0;
1149 }
1150 
1151 /**
1152 * Getting collation elements generated from a string
1153 * @param str character string contain collation elements contained in [] and
1154 *        seperated by space
1155 * @param ce array for storage, assuming size > 20
1156 * @param status error status
1157 * @return position at the end of the codepoint section
1158 */
getCEs(char * str,uint32_t * ces,UErrorCode * status)1159 static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) {
1160     char       *pStartCP     = uprv_strchr(str, '[');
1161     int         count        = 0;
1162     char       *pEndCP;
1163     char        primary[100];
1164     char        secondary[100];
1165     char        tertiary[100];
1166 
1167     while (*pStartCP == '[') {
1168         uint32_t primarycount   = 0;
1169         uint32_t secondarycount = 0;
1170         uint32_t tertiarycount  = 0;
1171         uint32_t CEi = 1;
1172         pEndCP = strchr(pStartCP, ']');
1173         if(pEndCP == NULL) {
1174             break;
1175         }
1176         pStartCP ++;
1177 
1178         primarycount   = readElement(&pStartCP, primary, ',', status);
1179         secondarycount = readElement(&pStartCP, secondary, ',', status);
1180         tertiarycount  = readElement(&pStartCP, tertiary, ']', status);
1181 
1182         /* I want to get the CEs entered right here, including continuation */
1183         ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status);
1184         if (U_FAILURE(*status)) {
1185             break;
1186         }
1187 
1188         while (2 * CEi < primarycount || CEi < secondarycount ||
1189                CEi < tertiarycount) {
1190             uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
1191             if (2 * CEi < primarycount) {
1192                 value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28);
1193                 value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24);
1194             }
1195 
1196             if (2 * CEi + 1 < primarycount) {
1197                 value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20);
1198                 value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16);
1199             }
1200 
1201             if (CEi < secondarycount) {
1202                 value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12);
1203                 value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8);
1204             }
1205 
1206             if (CEi < tertiarycount) {
1207                 value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4);
1208                 value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF);
1209             }
1210 
1211             CEi ++;
1212             ces[count ++] = value;
1213         }
1214 
1215       pStartCP = pEndCP + 1;
1216     }
1217     ces[count] = 0;
1218     return pStartCP;
1219 }
1220 
1221 /**
1222 * Getting the FractionalUCA.txt file stream
1223 */
getFractionalUCA(void)1224 static FileStream * getFractionalUCA(void)
1225 {
1226     char        newPath[256];
1227     char        backupPath[256];
1228     FileStream *result = NULL;
1229 
1230     /* Look inside ICU_DATA first */
1231     uprv_strcpy(newPath, ctest_dataSrcDir());
1232     uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING );
1233     uprv_strcat(newPath, "FractionalUCA.txt");
1234 
1235     /* As a fallback, try to guess where the source data was located
1236      *   at the time ICU was built, and look there.
1237      */
1238 #if defined (U_TOPSRCDIR)
1239     strcpy(backupPath, U_TOPSRCDIR  U_FILE_SEP_STRING "data");
1240 #else
1241     {
1242         UErrorCode errorCode = U_ZERO_ERROR;
1243         strcpy(backupPath, loadTestData(&errorCode));
1244         strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
1245     }
1246 #endif
1247     strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt");
1248 
1249     result = T_FileStream_open(newPath, "rb");
1250 
1251     if (result == NULL) {
1252         result = T_FileStream_open(backupPath, "rb");
1253         if (result == NULL) {
1254             log_err("Failed to open either %s or %s\n", newPath, backupPath);
1255         }
1256     }
1257     return result;
1258 }
1259 
1260 /**
1261 * Testing the CEs returned by the iterator
1262 */
TestCEs()1263 static void TestCEs() {
1264     FileStream *file = NULL;
1265     char        line[2048];
1266     char       *str;
1267     UChar       codepoints[10];
1268     uint32_t    ces[20];
1269     UErrorCode  status = U_ZERO_ERROR;
1270     UCollator          *coll = ucol_open("", &status);
1271     uint32_t lineNo = 0;
1272     UChar       contextCPs[5];
1273 
1274     if (U_FAILURE(status)) {
1275         log_err_status(status, "Error in opening root collator -> %s\n", u_errorName(status));
1276         return;
1277     }
1278 
1279     file = getFractionalUCA();
1280 
1281     if (file == NULL) {
1282         log_err("*** unable to open input FractionalUCA.txt file ***\n");
1283         return;
1284     }
1285 
1286 
1287     while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1288         int                 count = 0;
1289         UCollationElements *iter;
1290         int32_t            preContextCeLen=0;
1291         lineNo++;
1292         /* skip this line if it is empty or a comment or is a return value
1293         or start of some variable section */
1294         if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1295             line[0] == 0x000D || line[0] == '[') {
1296             continue;
1297         }
1298 
1299         str = getCodePoints(line, codepoints, contextCPs);
1300 
1301         /* these are 'fake' codepoints in the fractional UCA, and are used just
1302          * for positioning of indirect values. They should not go through this
1303          * test.
1304          */
1305         if(*codepoints == 0xFDD0) {
1306           continue;
1307         }
1308         if (*contextCPs != 0) {
1309             iter = ucol_openElements(coll, contextCPs, -1, &status);
1310             if (U_FAILURE(status)) {
1311                 log_err("Error in opening collation elements\n");
1312                 break;
1313             }
1314             while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) {
1315                 preContextCeLen++;
1316             }
1317             ucol_closeElements(iter);
1318         }
1319 
1320         getCEs(str, ces+preContextCeLen, &status);
1321         if (U_FAILURE(status)) {
1322             log_err("Error in parsing collation elements in FractionalUCA.txt\n");
1323             break;
1324         }
1325         iter = ucol_openElements(coll, codepoints, -1, &status);
1326         if (U_FAILURE(status)) {
1327             log_err("Error in opening collation elements\n");
1328             break;
1329         }
1330         for (;;) {
1331             uint32_t ce = (uint32_t)ucol_next(iter, &status);
1332             if (ce == 0xFFFFFFFF) {
1333                 ce = 0;
1334             }
1335             /* we now unconditionally reorder Thai/Lao prevowels, so this
1336              * test would fail if we don't skip here.
1337              */
1338             if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) {
1339               continue;
1340             }
1341             if (ce != ces[count] || U_FAILURE(status)) {
1342                 log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n");
1343                 break;
1344             }
1345             if (ces[count] == 0) {
1346                 break;
1347             }
1348             count ++;
1349         }
1350         ucol_closeElements(iter);
1351     }
1352 
1353     T_FileStream_close(file);
1354     ucol_close(coll);
1355 }
1356 
1357 /**
1358 * Testing the discontigous contractions
1359 */
TestDiscontiguos()1360 static void TestDiscontiguos() {
1361     const char               *rulestr    =
1362                             "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315";
1363           UChar               rule[50];
1364           int                 rulelen = u_unescape(rulestr, rule, 50);
1365     const char               *src[] = {
1366      "ADB", "ADBC", "A\\u0315B", "A\\u0315BC",
1367     /* base character blocked */
1368      "XD\\u0300", "XD\\u0300\\u0315",
1369     /* non blocking combining character */
1370      "X\\u0319\\u0300", "X\\u0319\\u0300\\u0315",
1371      /* blocking combining character */
1372      "X\\u0314\\u0300", "X\\u0314\\u0300\\u0315",
1373      /* contraction prefix */
1374      "ABDC", "AB\\u0315C","X\\u0300D\\u0315", "X\\u0300\\u0319\\u0315",
1375      "X\\u0300\\u031A\\u0315",
1376      /* ends not with a contraction character */
1377      "X\\u0319\\u0300D", "X\\u0319\\u0300\\u0315D", "X\\u0300D\\u0315D",
1378      "X\\u0300\\u0319\\u0315D", "X\\u0300\\u031A\\u0315D"
1379     };
1380     const char               *tgt[] = {
1381      /* non blocking combining character */
1382      "A D B", "A D BC", "A \\u0315 B", "A \\u0315 BC",
1383     /* base character blocked */
1384      "X D \\u0300", "X D \\u0300\\u0315",
1385     /* non blocking combining character */
1386      "X\\u0300 \\u0319", "X\\u0300\\u0315 \\u0319",
1387      /* blocking combining character */
1388      "X \\u0314 \\u0300", "X \\u0314 \\u0300\\u0315",
1389      /* contraction prefix */
1390      "AB DC", "AB \\u0315 C","X\\u0300 D \\u0315", "X\\u0300\\u0315 \\u0319",
1391      "X\\u0300 \\u031A \\u0315",
1392      /* ends not with a contraction character */
1393      "X\\u0300 \\u0319D", "X\\u0300\\u0315 \\u0319D", "X\\u0300 D\\u0315D",
1394      "X\\u0300\\u0315 \\u0319D", "X\\u0300 \\u031A\\u0315D"
1395     };
1396           int                 size   = 20;
1397           UCollator          *coll;
1398           UErrorCode          status    = U_ZERO_ERROR;
1399           int                 count     = 0;
1400           UCollationElements *iter;
1401           UCollationElements *resultiter;
1402 
1403     coll       = ucol_openRules(rule, rulelen, UCOL_OFF, UCOL_DEFAULT_STRENGTH,NULL, &status);
1404     iter       = ucol_openElements(coll, rule, 1, &status);
1405     resultiter = ucol_openElements(coll, rule, 1, &status);
1406 
1407     if (U_FAILURE(status)) {
1408         log_err_status(status, "Error opening collation rules -> %s\n", u_errorName(status));
1409         return;
1410     }
1411 
1412     while (count < size) {
1413         UChar  str[20];
1414         UChar  tstr[20];
1415         int    strLen = u_unescape(src[count], str, 20);
1416         UChar *s;
1417 
1418         ucol_setText(iter, str, strLen, &status);
1419         if (U_FAILURE(status)) {
1420             log_err("Error opening collation iterator\n");
1421             return;
1422         }
1423 
1424         u_unescape(tgt[count], tstr, 20);
1425         s = tstr;
1426 
1427         log_verbose("count %d\n", count);
1428 
1429         for (;;) {
1430             uint32_t  ce;
1431             UChar    *e = u_strchr(s, 0x20);
1432             if (e == 0) {
1433                 e = u_strchr(s, 0);
1434             }
1435             ucol_setText(resultiter, s, (int32_t)(e - s), &status);
1436             ce = ucol_next(resultiter, &status);
1437             if (U_FAILURE(status)) {
1438                 log_err("Error manipulating collation iterator\n");
1439                 return;
1440             }
1441             while (ce != UCOL_NULLORDER) {
1442                 if (ce != (uint32_t)ucol_next(iter, &status) ||
1443                     U_FAILURE(status)) {
1444                     log_err("Discontiguos contraction test mismatch\n");
1445                     return;
1446                 }
1447                 ce = ucol_next(resultiter, &status);
1448                 if (U_FAILURE(status)) {
1449                     log_err("Error getting next collation element\n");
1450                     return;
1451                 }
1452             }
1453             s = e + 1;
1454             if (*e == 0) {
1455                 break;
1456             }
1457         }
1458         ucol_reset(iter);
1459         backAndForth(iter);
1460         count ++;
1461     }
1462     ucol_closeElements(resultiter);
1463     ucol_closeElements(iter);
1464     ucol_close(coll);
1465 }
1466 
TestCEBufferOverflow()1467 static void TestCEBufferOverflow()
1468 {
1469     UChar               str[UCOL_EXPAND_CE_BUFFER_SIZE + 1];
1470     UErrorCode          status = U_ZERO_ERROR;
1471     UChar               rule[10];
1472     UCollator          *coll;
1473     UCollationElements *iter;
1474 
1475     u_uastrcpy(rule, "&z < AB");
1476     coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status);
1477     if (U_FAILURE(status)) {
1478         log_err_status(status, "Rule based collator not created for testing ce buffer overflow -> %s\n", u_errorName(status));
1479         return;
1480     }
1481 
1482     /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic
1483     test. this will cause an overflow in getPrev */
1484     str[0] = 0x0041;    /* 'A' */
1485     /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/
1486     uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);
1487     str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042;   /* 'B' */
1488     iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1,
1489                              &status);
1490     if (ucol_previous(iter, &status) == UCOL_NULLORDER ||
1491         status == U_BUFFER_OVERFLOW_ERROR) {
1492         log_err("CE buffer should not overflow with long string of trail surrogates\n");
1493     }
1494     ucol_closeElements(iter);
1495     ucol_close(coll);
1496 }
1497 
1498 /**
1499 * Checking collation element validity.
1500 */
1501 #define MAX_CODEPOINTS_TO_SHOW 10
showCodepoints(const UChar * codepoints,int length,char * codepointText)1502 static void showCodepoints(const UChar *codepoints, int length, char * codepointText) {
1503     int i, lengthToUse = length;
1504     if (lengthToUse > MAX_CODEPOINTS_TO_SHOW) {
1505         lengthToUse = MAX_CODEPOINTS_TO_SHOW;
1506     }
1507     for (i = 0; i < lengthToUse; ++i) {
1508         int bytesWritten = sprintf(codepointText, " %04X", *codepoints++);
1509         if (bytesWritten <= 0) {
1510             break;
1511         }
1512         codepointText += bytesWritten;
1513     }
1514     if (i < length) {
1515         sprintf(codepointText, " ...");
1516     }
1517 }
1518 
checkCEValidity(const UCollator * coll,const UChar * codepoints,int length)1519 static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
1520                              int length)
1521 {
1522     UErrorCode          status = U_ZERO_ERROR;
1523     UCollationElements *iter   = ucol_openElements(coll, codepoints, length,
1524                                                   &status);
1525     UBool result = FALSE;
1526     UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE;
1527     const char * collLocale;
1528 
1529     if (U_FAILURE(status)) {
1530         log_err("Error creating iterator for testing validity\n");
1531         return FALSE;
1532     }
1533     collLocale = ucol_getLocale(coll, ULOC_VALID_LOCALE, &status);
1534     if (U_FAILURE(status) || collLocale==NULL) {
1535         status = U_ZERO_ERROR;
1536         collLocale = "?";
1537     }
1538 
1539     for (;;) {
1540         uint32_t ce = ucol_next(iter, &status);
1541         uint32_t primary, p1, p2, secondary, tertiary;
1542         if (ce == UCOL_NULLORDER) {
1543             result = TRUE;
1544             break;
1545         }
1546         if (ce == 0) {
1547             continue;
1548         }
1549         if (ce == 0x02000202) {
1550             /* special CE for merge-sort character */
1551             if (*codepoints == 0xFFFE /* && length == 1 */) {
1552                 /*
1553                  * Note: We should check for length==1 but the token parser appears
1554                  * to give us trailing NUL characters.
1555                  * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet()
1556                  *                     rather than the internal collation rule parser
1557                  */
1558                 continue;
1559             } else {
1560                 log_err("Special 02/02/02 weight for code point U+%04X [len %d] != U+FFFE\n",
1561                         (int)*codepoints, (int)length);
1562                 break;
1563             }
1564         }
1565         primary   = UCOL_PRIMARYORDER(ce);
1566         p1 = primary >> 8;
1567         p2 = primary & 0xFF;
1568         secondary = UCOL_SECONDARYORDER(ce);
1569         tertiary  = UCOL_TERTIARYORDER(ce) & UCOL_REMOVE_CONTINUATION;
1570 
1571         if (!isContinuation(ce)) {
1572             if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
1573                 log_err("Empty CE %08lX except for case bits\n", (long)ce);
1574                 break;
1575             }
1576             if (p1 == 0) {
1577                 if (p2 != 0) {
1578                     log_err("Primary 00 xx in %08lX\n", (long)ce);
1579                     break;
1580                 }
1581                 primaryDone = TRUE;
1582             } else {
1583                 if (p1 <= 2 || p1 >= 0xF0) {
1584                     /* Primary first bytes F0..FF are specials. */
1585                     log_err("Primary first byte of %08lX out of range\n", (long)ce);
1586                     break;
1587                 }
1588                 if (p2 == 0) {
1589                     primaryDone = TRUE;
1590                 } else {
1591                     if (p2 <= 3 || p2 >= 0xFF) {
1592                         /* Primary second bytes 03 and FF are sort key compression terminators. */
1593                         log_err("Primary second byte of %08lX out of range\n", (long)ce);
1594                         break;
1595                     }
1596                     primaryDone = FALSE;
1597                 }
1598             }
1599             if (secondary == 0) {
1600                 if (primary != 0) {
1601                     log_err("Primary!=0 secondary==0 in %08lX\n", (long)ce);
1602                     break;
1603                 }
1604                 secondaryDone = TRUE;
1605             } else {
1606                 if (secondary <= 2 ||
1607                     (UCOL_BYTE_COMMON < secondary && secondary <= (UCOL_BYTE_COMMON + 0x80))
1608                 ) {
1609                     /* Secondary first bytes common+1..+0x80 are used for sort key compression. */
1610                     log_err("Secondary byte of %08lX out of range\n", (long)ce);
1611                     break;
1612                 }
1613                 secondaryDone = FALSE;
1614             }
1615             if (tertiary == 0) {
1616                 /* We know that ce != 0. */
1617                 log_err("Primary!=0 or secondary!=0 but tertiary==0 in %08lX\n", (long)ce);
1618                 break;
1619             }
1620             if (tertiary <= 2) {
1621                 log_err("Tertiary byte of %08lX out of range\n", (long)ce);
1622                 break;
1623             }
1624             tertiaryDone = FALSE;
1625         } else {
1626             if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
1627                 log_err("Empty continuation %08lX\n", (long)ce);
1628                 break;
1629             }
1630             if (primaryDone && primary != 0) {
1631                 log_err("Primary was done but continues in %08lX\n", (long)ce);
1632                 break;
1633             }
1634             if (p1 == 0) {
1635                 if (p2 != 0) {
1636                     log_err("Primary 00 xx in %08lX\n", (long)ce);
1637                     break;
1638                 }
1639                 primaryDone = TRUE;
1640             } else {
1641                 if (p1 <= 2) {
1642                     log_err("Primary first byte of %08lX out of range\n", (long)ce);
1643                     break;
1644                 }
1645                 if (p2 == 0) {
1646                     primaryDone = TRUE;
1647                 } else {
1648                     if (p2 <= 3) {
1649                         log_err("Primary second byte of %08lX out of range\n", (long)ce);
1650                         break;
1651                     }
1652                 }
1653             }
1654             if (secondaryDone && secondary != 0) {
1655                 log_err("Secondary was done but continues in %08lX\n", (long)ce);
1656                 break;
1657             }
1658             if (secondary == 0) {
1659                 secondaryDone = TRUE;
1660             } else {
1661                 if (secondary <= 2) {
1662                     log_err("Secondary byte of %08lX out of range\n", (long)ce);
1663                     break;
1664                 }
1665             }
1666             if (tertiaryDone && tertiary != 0) {
1667                 log_err("Tertiary was done but continues in %08lX\n", (long)ce);
1668                 break;
1669             }
1670             if (tertiary == 0) {
1671                 tertiaryDone = TRUE;
1672             } else if (tertiary <= 2) {
1673                 log_err("Tertiary byte of %08lX out of range\n", (long)ce);
1674                 break;
1675             }
1676         }
1677     }
1678     if (!result) {
1679         char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5];
1680         showCodepoints(codepoints, length, codepointText);
1681         log_err("Locale: %s  Code point string: %s\n", collLocale, codepointText);
1682     }
1683     ucol_closeElements(iter);
1684     return result;
1685 }
1686 
TestCEValidity()1687 static void TestCEValidity()
1688 {
1689     /* testing UCA collation elements */
1690     UErrorCode  status      = U_ZERO_ERROR;
1691     /* en_US has no tailorings */
1692     UCollator  *coll        = ucol_open("root", &status);
1693     /* tailored locales */
1694     char        locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"};
1695     const char *loc;
1696     FileStream *file = NULL;
1697     char        line[2048];
1698     UChar       codepoints[11];
1699     int         count = 0;
1700     int         maxCount = 0;
1701     UChar       contextCPs[3];
1702     UChar32     c;
1703     UParseError parseError;
1704     if (U_FAILURE(status)) {
1705         log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status));
1706         return;
1707     }
1708     log_verbose("Testing UCA elements\n");
1709     file = getFractionalUCA();
1710     if (file == NULL) {
1711         log_err("Fractional UCA data can not be opened\n");
1712         return;
1713     }
1714 
1715     while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1716         if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1717             line[0] == 0x000D || line[0] == '[') {
1718             continue;
1719         }
1720 
1721         getCodePoints(line, codepoints, contextCPs);
1722         checkCEValidity(coll, codepoints, u_strlen(codepoints));
1723     }
1724 
1725     log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1726     for (c = 0; c <= 0xffff; ++c) {
1727         if (u_isdefined(c)) {
1728             codepoints[0] = (UChar)c;
1729             checkCEValidity(coll, codepoints, 1);
1730         }
1731     }
1732     for (; c <= 0x10ffff; ++c) {
1733         if (u_isdefined(c)) {
1734             int32_t i = 0;
1735             U16_APPEND_UNSAFE(codepoints, i, c);
1736             checkCEValidity(coll, codepoints, i);
1737         }
1738     }
1739 
1740     ucol_close(coll);
1741 
1742     /* testing tailored collation elements */
1743     log_verbose("Testing tailored elements\n");
1744     if(getTestOption(QUICK_OPTION)) {
1745         maxCount = sizeof(locale)/sizeof(locale[0]);
1746     } else {
1747         maxCount = uloc_countAvailable();
1748     }
1749     while (count < maxCount) {
1750         const UChar *rules = NULL,
1751                     *current = NULL;
1752         UChar *rulesCopy = NULL;
1753         int32_t ruleLen = 0;
1754 
1755         uint32_t chOffset = 0;
1756         uint32_t chLen = 0;
1757         uint32_t exOffset = 0;
1758         uint32_t exLen = 0;
1759         uint32_t prefixOffset = 0;
1760         uint32_t prefixLen = 0;
1761         UBool    startOfRules = TRUE;
1762         UColOptionSet opts;
1763 
1764         UColTokenParser src;
1765         uint32_t strength = 0;
1766         uint16_t specs = 0;
1767         if(getTestOption(QUICK_OPTION)) {
1768             loc = locale[count];
1769         } else {
1770             loc = uloc_getAvailable(count);
1771             if(!hasCollationElements(loc)) {
1772                 count++;
1773                 continue;
1774             }
1775         }
1776 
1777         uprv_memset(&src, 0, sizeof(UColTokenParser));
1778 
1779         log_verbose("Testing CEs for %s\n", loc);
1780 
1781         coll      = ucol_open(loc, &status);
1782         if (U_FAILURE(status)) {
1783             log_err("%s collator creation failed\n", loc);
1784             return;
1785         }
1786 
1787         src.opts = &opts;
1788         rules = ucol_getRules(coll, &ruleLen);
1789 
1790         if (ruleLen > 0) {
1791             rulesCopy = (UChar *)uprv_malloc((ruleLen +
1792                 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
1793             uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
1794             src.current = src.source = rulesCopy;
1795             src.end = rulesCopy + ruleLen;
1796             src.extraCurrent = src.end;
1797             src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1798 
1799 	        /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
1800 	           the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
1801             while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL) {
1802               strength = src.parsedToken.strength;
1803               chOffset = src.parsedToken.charsOffset;
1804               chLen = src.parsedToken.charsLen;
1805               exOffset = src.parsedToken.extensionOffset;
1806               exLen = src.parsedToken.extensionLen;
1807               prefixOffset = src.parsedToken.prefixOffset;
1808               prefixLen = src.parsedToken.prefixLen;
1809               specs = src.parsedToken.flags;
1810 
1811                 startOfRules = FALSE;
1812                 uprv_memcpy(codepoints, src.source + chOffset,
1813                                                        chLen * sizeof(UChar));
1814                 codepoints[chLen] = 0;
1815                 checkCEValidity(coll, codepoints, chLen);
1816             }
1817             uprv_free(src.source);
1818         }
1819 
1820         ucol_close(coll);
1821         count ++;
1822     }
1823     T_FileStream_close(file);
1824 }
1825 
/**
 * Reports an invalid sort key through log_err().
 *
 * Prints each of the `length` code units in `codepoints` as hex, followed by
 * each of the `sklen` bytes in `sortkey` as hex.
 *
 * @param codepoints code units the sort key was generated from
 * @param length     number of code units to print
 * @param sortkey    sort key bytes
 * @param sklen      number of sort key bytes to print
 */
static void printSortKeyError(const UChar   *codepoints, int length,
                                    uint8_t *sortkey, int sklen)
{
    int i;

    log_err("Sortkey not valid for ");
    for (i = 0; i < length; i++) {
        log_err("0x%04x ", codepoints[i]);
    }

    log_err("\nSortkey : ");
    for (i = 0; i < sklen; i++) {
        log_err("0x%02x ", sortkey[i]);
    }
    log_err("\n");
}
1843 
1844 /**
1845 * Checking sort key validity for all levels
1846 */
checkSortKeyValidity(UCollator * coll,const UChar * codepoints,int length)1847 static UBool checkSortKeyValidity(UCollator *coll,
1848                                   const UChar *codepoints,
1849                                   int length)
1850 {
1851     UErrorCode status  = U_ZERO_ERROR;
1852     UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY,
1853                                       UCOL_TERTIARY, UCOL_QUATERNARY,
1854                                       UCOL_IDENTICAL};
1855     int        strengthlen = 5;
1856     int        strengthIndex = 0;
1857     int        caselevel   = 0;
1858 
1859     while (caselevel < 1) {
1860         if (caselevel == 0) {
1861             ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status);
1862         }
1863         else {
1864             ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status);
1865         }
1866 
1867         while (strengthIndex < strengthlen) {
1868             int        count01 = 0;
1869             uint32_t   count   = 0;
1870             uint8_t    sortkey[128];
1871             uint32_t   sklen;
1872 
1873             ucol_setStrength(coll, strength[strengthIndex]);
1874             sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128);
1875             while (sortkey[count] != 0) {
1876                 if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && strengthIndex != 4)) {
1877                     printSortKeyError(codepoints, length, sortkey, sklen);
1878                     return FALSE;
1879                 }
1880                 if (sortkey[count] == 1) {
1881                     count01 ++;
1882                 }
1883                 count ++;
1884             }
1885 
1886             if (count + 1 != sklen || (count01 != strengthIndex + caselevel)) {
1887                 printSortKeyError(codepoints, length, sortkey, sklen);
1888                 return FALSE;
1889             }
1890             strengthIndex ++;
1891         }
1892         caselevel ++;
1893     }
1894     return TRUE;
1895 }
1896 
TestSortKeyValidity(void)1897 static void TestSortKeyValidity(void)
1898 {
1899     /* testing UCA collation elements */
1900     UErrorCode  status      = U_ZERO_ERROR;
1901     /* en_US has no tailorings */
1902     UCollator  *coll        = ucol_open("en_US", &status);
1903     /* tailored locales */
1904     char        locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
1905     FileStream *file = NULL;
1906     char        line[2048];
1907     UChar       codepoints[10];
1908     int         count = 0;
1909     UChar       contextCPs[5];
1910     UParseError parseError;
1911     if (U_FAILURE(status)) {
1912         log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status));
1913         return;
1914     }
1915     log_verbose("Testing UCA elements\n");
1916     file = getFractionalUCA();
1917     if (file == NULL) {
1918         log_err("Fractional UCA data can not be opened\n");
1919         return;
1920     }
1921 
1922     while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
1923         if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
1924             line[0] == 0x000D || line[0] == '[') {
1925             continue;
1926         }
1927 
1928         getCodePoints(line, codepoints, contextCPs);
1929         if(codepoints[0] == 0xFFFE) {
1930             /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
1931             continue;
1932         }
1933         checkSortKeyValidity(coll, codepoints, u_strlen(codepoints));
1934     }
1935 
1936     log_verbose("Testing UCA elements for the whole range of unicode characters\n");
1937     codepoints[0] = 0;
1938 
1939     while (codepoints[0] < 0xFFFF) {
1940         if (u_isdefined((UChar32)codepoints[0])) {
1941             checkSortKeyValidity(coll, codepoints, 1);
1942         }
1943         codepoints[0] ++;
1944     }
1945 
1946     ucol_close(coll);
1947 
1948     /* testing tailored collation elements */
1949     log_verbose("Testing tailored elements\n");
1950     while (count < 5) {
1951         const UChar *rules = NULL,
1952                     *current = NULL;
1953         UChar *rulesCopy = NULL;
1954         int32_t ruleLen = 0;
1955 
1956         uint32_t chOffset = 0;
1957         uint32_t chLen = 0;
1958         uint32_t exOffset = 0;
1959         uint32_t exLen = 0;
1960         uint32_t prefixOffset = 0;
1961         uint32_t prefixLen = 0;
1962         UBool    startOfRules = TRUE;
1963         UColOptionSet opts;
1964 
1965         UColTokenParser src;
1966         uint32_t strength = 0;
1967         uint16_t specs = 0;
1968 
1969         uprv_memset(&src, 0, sizeof(UColTokenParser));
1970 
1971         coll      = ucol_open(locale[count], &status);
1972         if (U_FAILURE(status)) {
1973             log_err("%s collator creation failed\n", locale[count]);
1974             return;
1975         }
1976 
1977         src.opts = &opts;
1978         rules = ucol_getRules(coll, &ruleLen);
1979 
1980         if (ruleLen > 0) {
1981             rulesCopy = (UChar *)uprv_malloc((ruleLen +
1982                 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
1983             uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
1984             src.current = src.source = rulesCopy;
1985             src.end = rulesCopy + ruleLen;
1986             src.extraCurrent = src.end;
1987             src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1988 
1989 	        /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
1990 	           the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
1991             while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL) {
1992                 strength = src.parsedToken.strength;
1993                 chOffset = src.parsedToken.charsOffset;
1994                 chLen = src.parsedToken.charsLen;
1995                 exOffset = src.parsedToken.extensionOffset;
1996                 exLen = src.parsedToken.extensionLen;
1997                 prefixOffset = src.parsedToken.prefixOffset;
1998                 prefixLen = src.parsedToken.prefixLen;
1999                 specs = src.parsedToken.flags;
2000 
2001                 startOfRules = FALSE;
2002                 uprv_memcpy(codepoints, src.source + chOffset,
2003                                                        chLen * sizeof(UChar));
2004                 codepoints[chLen] = 0;
2005                 if(codepoints[0] == 0xFFFE) {
2006                     /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
2007                     continue;
2008                 }
2009                 checkSortKeyValidity(coll, codepoints, chLen);
2010             }
2011             uprv_free(src.source);
2012         }
2013 
2014         ucol_close(coll);
2015         count ++;
2016     }
2017     T_FileStream_close(file);
2018 }
2019 
2020 #endif /* #if !UCONFIG_NO_COLLATION */
2021