• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationtest.cpp
7 *
8 * created on: 2012apr27
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/coll.h"
17 #include "unicode/errorcode.h"
18 #include "unicode/localpointer.h"
19 #include "unicode/normalizer2.h"
20 #include "unicode/sortkey.h"
21 #include "unicode/std_string.h"
22 #include "unicode/strenum.h"
23 #include "unicode/tblcoll.h"
24 #include "unicode/uiter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/unistr.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/ustring.h"
29 #include "charstr.h"
30 #include "cmemory.h"
31 #include "collation.h"
32 #include "collationdata.h"
33 #include "collationfcd.h"
34 #include "collationiterator.h"
35 #include "collationroot.h"
36 #include "collationrootelements.h"
37 #include "collationruleparser.h"
38 #include "collationweights.h"
39 #include "cstring.h"
40 #include "intltest.h"
41 #include "normalizer2impl.h"
42 #include "ucbuf.h"
43 #include "uhash.h"
44 #include "uitercollationiterator.h"
45 #include "utf16collationiterator.h"
46 #include "utf8collationiterator.h"
47 #include "uvectr32.h"
48 #include "uvectr64.h"
49 #include "writesrc.h"
50 
// Number of elements of a true array (not a pointer), as an int32_t.
// The whole expansion is parenthesized so that it remains a single operand
// next to operators that bind more tightly than a cast (for example sizeof).
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
52 
53 // TODO: Move to ucbuf.h
54 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
55 
56 class CodePointIterator;
57 
58 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
59 
60 class CollationTest : public IntlTest {
61 public:
CollationTest()62     CollationTest()
63             : fcd(NULL), nfd(NULL),
64               fileLineNumber(0),
65               coll(NULL) {}
66 
~CollationTest()67     ~CollationTest() {
68         delete coll;
69     }
70 
71     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
72 
73     void TestMinMax();
74     void TestImplicits();
75     void TestNulTerminated();
76     void TestIllegalUTF8();
77     void TestShortFCDData();
78     void TestFCD();
79     void TestCollationWeights();
80     void TestRootElements();
81     void TestTailoredElements();
82     void TestDataDriven();
83 
84 private:
85     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
86     void checkAllocWeights(CollationWeights &cw,
87                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
88                            int32_t someLength, int32_t minCount);
89 
90     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
91     static UnicodeString printCollationKey(const CollationKey &key);
92 
93     // Helpers & fields for data-driven test.
isCROrLF(UChar c)94     static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)95     static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)96     static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
skipSpaces(int32_t i)97     int32_t skipSpaces(int32_t i) {
98         while(isSpace(fileLine[i])) { ++i; }
99         return i;
100     }
101 
102     UBool readLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
103     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
104     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
105     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
106     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
107     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
108     void setRootCollator(IcuTestErrorCode &errorCode);
109     void setLocaleCollator(IcuTestErrorCode &errorCode);
110 
111     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
112 
113     UBool getSortKeyParts(const UChar *s, int32_t length,
114                           CharString &dest, int32_t partSize,
115                           IcuTestErrorCode &errorCode);
116     UBool getCollationKey(const char *norm, const UnicodeString &line,
117                           const UChar *s, int32_t length,
118                           CollationKey &key, IcuTestErrorCode &errorCode);
119     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
120                           const UnicodeString &prevString, const UnicodeString &s,
121                           UCollationResult expectedOrder, Collation::Level expectedLevel,
122                           IcuTestErrorCode &errorCode);
123     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
124 
125     const Normalizer2 *fcd, *nfd;
126     UnicodeString fileLine;
127     int32_t fileLineNumber;
128     UnicodeString fileTestName;
129     Collator *coll;
130 };
131 
createCollationTest()132 extern IntlTest *createCollationTest() {
133     return new CollationTest();
134 }
135 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)136 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
137     if(exec) {
138         logln("TestSuite CollationTest: ");
139     }
140     TESTCASE_AUTO_BEGIN;
141     TESTCASE_AUTO(TestMinMax);
142     TESTCASE_AUTO(TestImplicits);
143     TESTCASE_AUTO(TestNulTerminated);
144     TESTCASE_AUTO(TestIllegalUTF8);
145     TESTCASE_AUTO(TestShortFCDData);
146     TESTCASE_AUTO(TestFCD);
147     TESTCASE_AUTO(TestCollationWeights);
148     TESTCASE_AUTO(TestRootElements);
149     TESTCASE_AUTO(TestTailoredElements);
150     TESTCASE_AUTO(TestDataDriven);
151     TESTCASE_AUTO_END;
152 }
153 
TestMinMax()154 void CollationTest::TestMinMax() {
155     IcuTestErrorCode errorCode(*this, "TestMinMax");
156 
157     setRootCollator(errorCode);
158     if(errorCode.isFailure()) {
159         errorCode.reset();
160         return;
161     }
162     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
163     if(rbc == NULL) {
164         errln("the root collator is not a RuleBasedCollator");
165         return;
166     }
167 
168     static const UChar s[2] = { 0xfffe, 0xffff };
169     UVector64 ces(errorCode);
170     rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
171     errorCode.assertSuccess();
172     if(ces.size() != 2) {
173         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
174         return;
175     }
176     int64_t ce = ces.elementAti(0);
177     int64_t expected =
178         ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
179         Collation::MERGE_SEPARATOR_LOWER32;
180     if(ce != expected) {
181         errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce);
182     }
183 
184     ce = ces.elementAti(1);
185     expected = Collation::makeCE(Collation::MAX_PRIMARY);
186     if(ce != expected) {
187         errln("CE(U+ffff)=%04lx != max..", (long)ce);
188     }
189 }
190 
TestImplicits()191 void CollationTest::TestImplicits() {
192     IcuTestErrorCode errorCode(*this, "TestImplicits");
193 
194     const CollationData *cd = CollationRoot::getData(errorCode);
195     if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
196         return;
197     }
198 
199     // Implicit primary weights should be assigned for the following sets,
200     // and sort in ascending order by set and then code point.
201     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
202     // core Han Unified Ideographs
203     UnicodeSet coreHan("[\\p{unified_ideograph}&"
204                             "[\\p{Block=CJK_Unified_Ideographs}"
205                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
206                        errorCode);
207     // all other Unified Han ideographs
208     UnicodeSet otherHan("[\\p{unified ideograph}-"
209                             "[\\p{Block=CJK_Unified_Ideographs}"
210                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
211                         errorCode);
212     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
213     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
214     if(errorCode.logIfFailureAndReset("UnicodeSet")) {
215         return;
216     }
217     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
218     UChar32 prev = 0;
219     uint32_t prevPrimary = 0;
220     UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
221     for(int32_t i = 0; i < LENGTHOF(sets); ++i) {
222         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
223         while(iter->next()) {
224             UChar32 c = iter->getCodepoint();
225             UnicodeString s(c);
226             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
227             int64_t ce = ci.nextCE(errorCode);
228             int64_t ce2 = ci.nextCE(errorCode);
229             if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
230                 return;
231             }
232             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
233                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
234                 continue;
235             }
236             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
237                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
238                       (long)c, (long)(ce & 0xffffffff));
239                 continue;
240             }
241             uint32_t primary = (uint32_t)(ce >> 32);
242             if(!(primary > prevPrimary)) {
243                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
244                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
245             }
246             prev = c;
247             prevPrimary = primary;
248         }
249     }
250 }
251 
TestNulTerminated()252 void CollationTest::TestNulTerminated() {
253     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
254     const CollationData *data = CollationRoot::getData(errorCode);
255     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
256         return;
257     }
258 
259     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
260 
261     UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
262     UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
263     for(int32_t i = 0;; ++i) {
264         int64_t ce1 = ci1.nextCE(errorCode);
265         int64_t ce2 = ci2.nextCE(errorCode);
266         if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
267             return;
268         }
269         if(ce1 != ce2) {
270             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
271             break;
272         }
273         if(ce1 == Collation::NO_CE) { break; }
274     }
275 }
276 
TestIllegalUTF8()277 void CollationTest::TestIllegalUTF8() {
278     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
279 
280     setRootCollator(errorCode);
281     if(errorCode.isFailure()) {
282         errorCode.reset();
283         return;
284     }
285     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
286 
287     static const char *strings[] = {
288         // U+FFFD
289         "a\xef\xbf\xbdz",
290         // illegal byte sequences
291         "a\x80z",  // trail byte
292         "a\xc1\x81z",  // non-shortest form
293         "a\xe0\x82\x83z",  // non-shortest form
294         "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
295         "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
296         "a\xf0\x8f\xbf\xbfz",  // non-shortest form
297         "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
298     };
299 
300     StringPiece fffd(strings[0]);
301     for(int32_t i = 1; i < LENGTHOF(strings); ++i) {
302         StringPiece illegal(strings[i]);
303         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
304         if(order != UCOL_EQUAL) {
305             errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
306                   (int)i, order);
307         }
308     }
309 }
310 
311 namespace {
312 
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)313 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
314     for(UChar32 c = 0x10000; c < 0x110000;) {
315         UChar32 next = c + 0x400;
316         if(src.containsSome(c, next - 1)) {
317             dest.add(U16_LEAD(c));
318         }
319         c = next;
320     }
321 }
322 
323 }  // namespace
324 
TestShortFCDData()325 void CollationTest::TestShortFCDData() {
326     // See CollationFCD class comments.
327     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
328     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
329     errorCode.assertSuccess();
330     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
331     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
332     UnicodeSet lccc;  // actual
333     for(UChar32 c = 0; c <= 0xffff; ++c) {
334         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
335     }
336     UnicodeSet diff(expectedLccc);
337     diff.removeAll(lccc);
338     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
339     UnicodeString empty("[]");
340     UnicodeString diffString;
341     diff.toPattern(diffString, TRUE);
342     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
343     diff = lccc;
344     diff.removeAll(expectedLccc);
345     diff.toPattern(diffString, TRUE);
346     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
347 
348     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
349     if (errorCode.isSuccess()) {
350         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
351         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
352         UnicodeSet tccc;  // actual
353         for(UChar32 c = 0; c <= 0xffff; ++c) {
354             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
355         }
356         diff = expectedTccc;
357         diff.removeAll(tccc);
358         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
359         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
360         diff = tccc;
361         diff.removeAll(expectedTccc);
362         diff.toPattern(diffString, TRUE);
363         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
364     }
365 }
366 
367 class CodePointIterator {
368 public:
CodePointIterator(const UChar32 * cp,int32_t length)369     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()370     void resetToStart() { pos = 0; }
next()371     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()372     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const373     int32_t getLength() const { return length; }
getIndex() const374     int getIndex() const { return (int)pos; }
375 private:
376     const UChar32 *cp;
377     int32_t length;
378     int32_t pos;
379 };
380 
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)381 void CollationTest::checkFCD(const char *name,
382                              CollationIterator &ci, CodePointIterator &cpi) {
383     IcuTestErrorCode errorCode(*this, "checkFCD");
384 
385     // Iterate forward to the limit.
386     for(;;) {
387         UChar32 c1 = ci.nextCodePoint(errorCode);
388         UChar32 c2 = cpi.next();
389         if(c1 != c2) {
390             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
391                   name, (long)c1, (long)c2, cpi.getIndex());
392             return;
393         }
394         if(c1 < 0) { break; }
395     }
396 
397     // Iterate backward most of the way.
398     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
399         UChar32 c1 = ci.previousCodePoint(errorCode);
400         UChar32 c2 = cpi.previous();
401         if(c1 != c2) {
402             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
403                   name, (long)c1, (long)c2, cpi.getIndex());
404             return;
405         }
406     }
407 
408     // Forward again.
409     for(;;) {
410         UChar32 c1 = ci.nextCodePoint(errorCode);
411         UChar32 c2 = cpi.next();
412         if(c1 != c2) {
413             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
414                   name, (long)c1, (long)c2, cpi.getIndex());
415             return;
416         }
417         if(c1 < 0) { break; }
418     }
419 
420     // Iterate backward to the start.
421     for(;;) {
422         UChar32 c1 = ci.previousCodePoint(errorCode);
423         UChar32 c2 = cpi.previous();
424         if(c1 != c2) {
425             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
426                   name, (long)c1, (long)c2, cpi.getIndex());
427             return;
428         }
429         if(c1 < 0) { break; }
430     }
431 }
432 
TestFCD()433 void CollationTest::TestFCD() {
434     IcuTestErrorCode errorCode(*this, "TestFCD");
435     const CollationData *data = CollationRoot::getData(errorCode);
436     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
437         return;
438     }
439 
440     // Input string, not FCD, NUL-terminated.
441     static const UChar s[] = {
442         0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
443         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
444         0x327, 0x308,  // ccc=202, 230
445         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
446         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
447         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
448         0xac01,
449         0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
450         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
451         0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
452         0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
453         0x4e00, 0xf81,
454         0
455     };
456     // Expected code points.
457     static const UChar32 cp[] = {
458         0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
459         0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
460         0x1D15F, 0x1D16D,
461         0xac01,
462         0x63, 0x327, 0x1D165, 0x1D16D,
463         0x61,
464         0xf71, 0xf71, 0xf72, 0xf74, 0x301,
465         0x4e00, 0xf71, 0xf80
466     };
467 
468     FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
469     if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
470         return;
471     }
472     CodePointIterator cpi(cp, LENGTHOF(cp));
473     checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
474 
475 #if U_HAVE_STD_STRING
476     cpi.resetToStart();
477     std::string utf8;
478     UnicodeString(s).toUTF8String(utf8);
479     FCDUTF8CollationIterator u8ci(data, FALSE,
480                                   reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
481     if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
482         return;
483     }
484     checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
485 #endif
486 
487     cpi.resetToStart();
488     UCharIterator iter;
489     uiter_setString(&iter, s, LENGTHOF(s) - 1);  // -1: without the terminating NUL
490     FCDUIterCollationIterator uici(data, FALSE, iter, 0);
491     if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
492         return;
493     }
494     checkFCD("FCDUIterCollationIterator", uici, cpi);
495 }
496 
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)497 void CollationTest::checkAllocWeights(CollationWeights &cw,
498                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
499                                       int32_t someLength, int32_t minCount) {
500     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
501         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
502               (long)lowerLimit, (long)upperLimit, (long)n);
503         return;
504     }
505     uint32_t previous = lowerLimit;
506     int32_t count = 0;  // number of weights that have someLength
507     for(int32_t i = 0; i < n; ++i) {
508         uint32_t w = cw.nextWeight();
509         if(w == 0xffffffff) {
510             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
511                   "returns only %ld weights",
512                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
513             return;
514         }
515         if(!(previous < w && w < upperLimit)) {
516             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
517                   "number %ld -> %lx not between %lx and %lx",
518                   (long)lowerLimit, (long)upperLimit, (long)n,
519                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
520             return;
521         }
522         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
523     }
524     if(count < minCount) {
525         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
526               "returns only %ld < %ld weights of length %d",
527               (long)lowerLimit, (long)upperLimit, (long)n,
528               (long)count, (long)minCount, (int)someLength);
529     }
530 }
531 
TestCollationWeights()532 void CollationTest::TestCollationWeights() {
533     CollationWeights cw;
534 
535     // Non-compressible primaries use 254 second bytes 02..FF.
536     logln("CollationWeights.initForPrimary(non-compressible)");
537     cw.initForPrimary(FALSE);
538     // Expect 1 weight 11 and 254 weights 12xx.
539     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
540     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
541     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
542     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
543     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
544     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
545     // Expect 254^2=64516 three-byte weights.
546     // During computation, there should be 3 three-byte ranges
547     // 10ffff, 11xxxx, 120202.
548     // The middle one should be split 64515:1,
549     // and the newly-split-off range and the last ranged lengthened.
550     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
551     // Expect weights 1102 & 1103.
552     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
553     // Expect weights 102102 & 102103.
554     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
555 
556     // Compressible primaries use 251 second bytes 04..FE.
557     logln("CollationWeights.initForPrimary(compressible)");
558     cw.initForPrimary(TRUE);
559     // Expect 1 weight 11 and 251 weights 12xx.
560     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
561     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
562     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
563     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
564     // Expect weights 1104 & 1105.
565     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
566     // Expect weights 102102 & 102103.
567     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
568 
569     // Secondary and tertiary weights use only bytes 3 & 4.
570     logln("CollationWeights.initForSecondary()");
571     cw.initForSecondary();
572     // Expect weights fbxx and all four fc..ff.
573     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
574 
575     logln("CollationWeights.initForTertiary()");
576     cw.initForTertiary();
577     // Expect weights 3dxx and both 3e & 3f.
578     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
579 }
580 
581 namespace {
582 
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)583 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
584                 uint32_t p, uint32_t s, uint32_t ctq) {
585     uint32_t p1 = p >> 24;
586     uint32_t p2 = (p >> 16) & 0xff;
587     uint32_t p3 = (p >> 8) & 0xff;
588     uint32_t p4 = p & 0xff;
589     uint32_t s1 = s >> 8;
590     uint32_t s2 = s & 0xff;
591     // ctq = Case, Tertiary, Quaternary
592     uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
593     uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
594     uint32_t t1 = t >> 8;
595     uint32_t t2 = t & 0xff;
596     uint32_t q = ctq & Collation::QUATERNARY_MASK;
597     // No leading zero bytes.
598     if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
599         return FALSE;
600     }
601     // No intermediate zero bytes.
602     if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
603         return FALSE;
604     }
605     if(p2 != 0 && p3 == 0 && p4 != 0) {
606         return FALSE;
607     }
608     // Minimum & maximum lead bytes.
609     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
610             (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) ||
611             (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) {
612         return FALSE;
613     }
614     if(t1 != 0 && t1 > 0x3f) {
615         return FALSE;
616     }
617     if(c > 2) {
618         return FALSE;
619     }
620     // The valid byte range for the second primary byte depends on compressibility.
621     if(p2 != 0) {
622         if(data.isCompressibleLeadByte(p1)) {
623             if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
624                     Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
625                 return FALSE;
626             }
627         } else {
628             if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
629                 return FALSE;
630             }
631         }
632     }
633     // Other bytes just need to avoid the level separator.
634     // Trailing zeros are ok.
635     U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
636     if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
637             s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
638         return FALSE;
639     }
640     // Well-formed CEs.
641     if(p == 0) {
642         if(s == 0) {
643             if(t == 0) {
644                 // Completely ignorable CE.
645                 // Quaternary CEs are not supported.
646                 if(c != 0 || q != 0) {
647                     return FALSE;
648                 }
649             } else {
650                 // Tertiary CE.
651                 if(t < re.getTertiaryBoundary() || c != 2) {
652                     return FALSE;
653                 }
654             }
655         } else {
656             // Secondary CE.
657             if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
658                 return FALSE;
659             }
660         }
661     } else {
662         // Primary CE.
663         if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
664                 s >= re.getSecondaryBoundary()) {
665             return FALSE;
666         }
667         if(t == 0 || t >= re.getTertiaryBoundary()) {
668             return FALSE;
669         }
670     }
671     return TRUE;
672 }
673 
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)674 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
675     uint32_t p = (uint32_t)(ce >> 32);
676     uint32_t secTer = (uint32_t)ce;
677     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
678 }
679 
680 class RootElementsIterator {
681 public:
RootElementsIterator(const CollationData & root)682     RootElementsIterator(const CollationData &root)
683             : data(root),
684               elements(root.rootElements), length(root.rootElementsLength),
685               pri(0), secTer(0),
686               index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
687 
next()688     UBool next() {
689         if(index >= length) { return FALSE; }
690         uint32_t p = elements[index];
691         if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
692         if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
693             ++index;
694             secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
695             return TRUE;
696         }
697         if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
698             // End of a range, enumerate the primaries in the range.
699             int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
700             p &= 0xffffff00;
701             if(pri == p) {
702                 // Finished the range, return the next CE after it.
703                 ++index;
704                 return next();
705             }
706             U_ASSERT(pri < p);
707             // Return the next primary in this range.
708             UBool isCompressible = data.isCompressiblePrimary(pri);
709             if((pri & 0xffff) == 0) {
710                 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
711             } else {
712                 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
713             }
714             return TRUE;
715         }
716         // Simple primary CE.
717         ++index;
718         pri = p;
719         secTer = Collation::COMMON_SEC_AND_TER_CE;
720         return TRUE;
721     }
722 
getPrimary() const723     uint32_t getPrimary() const { return pri; }
getSecTer() const724     uint32_t getSecTer() const { return secTer; }
725 
726 private:
727     const CollationData &data;
728     const uint32_t *elements;
729     int32_t length;
730 
731     uint32_t pri;
732     uint32_t secTer;
733     int32_t index;
734 };
735 
736 }  // namespace
737 
TestRootElements()738 void CollationTest::TestRootElements() {
739     IcuTestErrorCode errorCode(*this, "TestRootElements");
740     const CollationData *root = CollationRoot::getData(errorCode);
741     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
742         return;
743     }
744     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
745     RootElementsIterator iter(*root);
746 
747     // We check each root CE for validity,
748     // and we also verify that there is a tailoring gap between each two CEs.
749     CollationWeights cw1c;  // compressible primary weights
750     CollationWeights cw1u;  // uncompressible primary weights
751     CollationWeights cw2;
752     CollationWeights cw3;
753 
754     cw1c.initForPrimary(TRUE);
755     cw1u.initForPrimary(FALSE);
756     cw2.initForSecondary();
757     cw3.initForTertiary();
758 
759     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
760     // nor the special merge-separator CE for U+FFFE.
761     uint32_t prevPri = 0;
762     uint32_t prevSec = 0;
763     uint32_t prevTer = 0;
764     while(iter.next()) {
765         uint32_t pri = iter.getPrimary();
766         uint32_t secTer = iter.getSecTer();
767         // CollationRootElements CEs must have 0 case and quaternary bits.
768         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
769             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
770                   (long)pri, (long)secTer);
771         }
772         uint32_t sec = secTer >> 16;
773         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
774         uint32_t ctq = ter;
775         if(pri == 0 && sec == 0 && ter != 0) {
776             // Tertiary CEs must have uppercase bits,
777             // but they are not stored in the CollationRootElements.
778             ctq |= 0x8000;
779         }
780         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
781             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
782         } else {
783             if(pri != prevPri) {
784                 uint32_t newWeight = 0;
785                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
786                     // There is currently no tailoring gap after primary ignorables,
787                     // and we forbid tailoring after U+FFFD and U+FFFF.
788                 } else if(root->isCompressiblePrimary(prevPri)) {
789                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
790                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
791                               (long)prevPri, (long)pri);
792                     } else {
793                         newWeight = cw1c.nextWeight();
794                     }
795                 } else {
796                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
797                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
798                               (long)prevPri, (long)pri);
799                     } else {
800                         newWeight = cw1u.nextWeight();
801                     }
802                 }
803                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
804                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
805                           (long)prevPri, (long)newWeight, (long)pri);
806                 }
807             } else if(sec != prevSec) {
808                 uint32_t lowerLimit =
809                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
810                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
811                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
812                 } else {
813                     uint32_t newWeight = cw2.nextWeight();
814                     if(!(prevSec < newWeight && newWeight < sec)) {
815                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
816                               (long)lowerLimit, (long)newWeight, (long)sec);
817                     }
818                 }
819             } else if(ter != prevTer) {
820                 uint32_t lowerLimit =
821                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
822                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
823                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
824                 } else {
825                     uint32_t newWeight = cw3.nextWeight();
826                     if(!(prevTer < newWeight && newWeight < ter)) {
827                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
828                               (long)lowerLimit, (long)newWeight, (long)ter);
829                     }
830                 }
831             } else {
832                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
833             }
834         }
835         prevPri = pri;
836         prevSec = sec;
837         prevTer = ter;
838     }
839 }
840 
TestTailoredElements()841 void CollationTest::TestTailoredElements() {
842     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
843     const CollationData *root = CollationRoot::getData(errorCode);
844     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
845         return;
846     }
847     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
848 
849     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
850     if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
851         return;
852     }
853     uhash_setKeyDeleter(prevLocales, uprv_free);
854     // TestRootElements() tests the root collator which does not have tailorings.
855     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
856     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
857     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
858 
859     UVector64 ces(errorCode);
860     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
861     U_ASSERT(locales.isValid());
862     const char *localeID = "root";
863     do {
864         Locale locale(localeID);
865         LocalPointer<StringEnumeration> types(
866                 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
867         errorCode.assertSuccess();
868         const char *type = NULL;  // default type
869         do {
870             Locale localeWithType(locale);
871             if(type != NULL) {
872                 localeWithType.setKeywordValue("collation", type, errorCode);
873             }
874             errorCode.assertSuccess();
875             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
876             if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
877                                               localeWithType.getName())) {
878                 continue;
879             }
880             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
881             if(uhash_geti(prevLocales, actual.getName()) != 0) {
882                 continue;
883             }
884             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
885             errorCode.assertSuccess();
886             logln("TestTailoredElements(): requested %s -> actual %s",
887                   localeWithType.getName(), actual.getName());
888             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
889             if(rbc == NULL) {
890                 continue;
891             }
892             // Note: It would be better to get tailored strings such that we can
893             // identify the prefix, and only get the CEs for the prefix+string,
894             // not also for the prefix.
895             // There is currently no API for that.
896             // It would help in an unusual case where a contraction starting in the prefix
897             // extends past its end, and we do not see the intended mapping.
898             // For example, for a mapping p|st, if there is also a contraction ps,
899             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
900             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
901             errorCode.assertSuccess();
902             UnicodeSetIterator iter(*tailored);
903             while(iter.next()) {
904                 const UnicodeString &s = iter.getString();
905                 ces.removeAllElements();
906                 rbc->internalGetCEs(s, ces, errorCode);
907                 errorCode.assertSuccess();
908                 for(int32_t i = 0; i < ces.size(); ++i) {
909                     int64_t ce = ces.elementAti(i);
910                     if(!isValidCE(rootElements, *root, ce)) {
911                         errln("invalid tailored CE %016llx at CE index %d from string:",
912                               (long long)ce, (int)i);
913                         infoln(prettify(s));
914                     }
915                 }
916             }
917         } while((type = types->next(NULL, errorCode)) != NULL);
918     } while((localeID = locales->next(NULL, errorCode)) != NULL);
919     uhash_close(prevLocales);
920 }
921 
printSortKey(const uint8_t * p,int32_t length)922 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
923     UnicodeString s;
924     for(int32_t i = 0; i < length; ++i) {
925         if(i > 0) { s.append((UChar)0x20); }
926         uint8_t b = p[i];
927         if(b == 0) {
928             s.append((UChar)0x2e);  // period
929         } else if(b == 1) {
930             s.append((UChar)0x7c);  // vertical bar
931         } else {
932             appendHex(b, 2, s);
933         }
934     }
935     return s;
936 }
937 
printCollationKey(const CollationKey & key)938 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
939     int32_t length;
940     const uint8_t *p = key.getByteArray(length);
941     return printSortKey(p, length);
942 }
943 
readLine(UCHARBUF * f,IcuTestErrorCode & errorCode)944 UBool CollationTest::readLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
945     int32_t lineLength;
946     const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
947     if(line == NULL || errorCode.isFailure()) {
948         fileLine.remove();
949         return FALSE;
950     }
951     ++fileLineNumber;
952     // Strip trailing CR/LF, comments, and spaces.
953     const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
954     if(comment != NULL) {
955         lineLength = (int32_t)(comment - line);
956     } else {
957         while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
958     }
959     while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
960     fileLine.setTo(FALSE, line, lineLength);
961     return TRUE;
962 }
963 
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)964 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
965                                 UErrorCode &errorCode) {
966     int32_t length = fileLine.length();
967     int32_t i;
968     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
969     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
970     if(pipeIndex >= 0) {
971         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
972         if(prefix.isEmpty()) {
973             errln("empty prefix on line %d", (int)fileLineNumber);
974             infoln(fileLine);
975             errorCode = U_PARSE_ERROR;
976             return;
977         }
978         start = pipeIndex + 1;
979     } else {
980         prefix.remove();
981     }
982     s = fileLine.tempSubStringBetween(start, i).unescape();
983     if(s.isEmpty()) {
984         errln("empty string on line %d", (int)fileLineNumber);
985         infoln(fileLine);
986         errorCode = U_PARSE_ERROR;
987         return;
988     }
989     start = i;
990 }
991 
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)992 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
993     Collation::Level relation;
994     int32_t start;
995     if(fileLine[0] == 0x3c) {  // <
996         UChar second = fileLine[1];
997         start = 2;
998         switch(second) {
999         case 0x31:  // <1
1000             relation = Collation::PRIMARY_LEVEL;
1001             break;
1002         case 0x32:  // <2
1003             relation = Collation::SECONDARY_LEVEL;
1004             break;
1005         case 0x33:  // <3
1006             relation = Collation::TERTIARY_LEVEL;
1007             break;
1008         case 0x34:  // <4
1009             relation = Collation::QUATERNARY_LEVEL;
1010             break;
1011         case 0x63:  // <c
1012             relation = Collation::CASE_LEVEL;
1013             break;
1014         case 0x69:  // <i
1015             relation = Collation::IDENTICAL_LEVEL;
1016             break;
1017         default:  // just <
1018             relation = Collation::NO_LEVEL;
1019             start = 1;
1020             break;
1021         }
1022     } else if(fileLine[0] == 0x3d) {  // =
1023         relation = Collation::ZERO_LEVEL;
1024         start = 1;
1025     } else {
1026         start = 0;
1027     }
1028     if(start == 0 || !isSpace(fileLine[start])) {
1029         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1030         infoln(fileLine);
1031         errorCode.set(U_PARSE_ERROR);
1032         return Collation::NO_LEVEL;
1033     }
1034     start = skipSpaces(start);
1035     UnicodeString prefix;
1036     parseString(start, prefix, s, errorCode);
1037     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1038         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1039         infoln(fileLine);
1040         errorCode.set(U_PARSE_ERROR);
1041         return Collation::NO_LEVEL;
1042     }
1043     if(start < fileLine.length()) {
1044         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1045         infoln(fileLine);
1046         errorCode.set(U_PARSE_ERROR);
1047         return Collation::NO_LEVEL;
1048     }
1049     return relation;
1050 }
1051 
1052 static const struct {
1053     const char *name;
1054     UColAttribute attr;
1055 } attributes[] = {
1056     { "backwards", UCOL_FRENCH_COLLATION },
1057     { "alternate", UCOL_ALTERNATE_HANDLING },
1058     { "caseFirst", UCOL_CASE_FIRST },
1059     { "caseLevel", UCOL_CASE_LEVEL },
1060     // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1061     { "strength", UCOL_STRENGTH },
1062     // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1063     { "numeric", UCOL_NUMERIC_COLLATION }
1064 };
1065 
1066 static const struct {
1067     const char *name;
1068     UColAttributeValue value;
1069 } attributeValues[] = {
1070     { "default", UCOL_DEFAULT },
1071     { "primary", UCOL_PRIMARY },
1072     { "secondary", UCOL_SECONDARY },
1073     { "tertiary", UCOL_TERTIARY },
1074     { "quaternary", UCOL_QUATERNARY },
1075     { "identical", UCOL_IDENTICAL },
1076     { "off", UCOL_OFF },
1077     { "on", UCOL_ON },
1078     { "shifted", UCOL_SHIFTED },
1079     { "non-ignorable", UCOL_NON_IGNORABLE },
1080     { "lower", UCOL_LOWER_FIRST },
1081     { "upper", UCOL_UPPER_FIRST }
1082 };
1083 
parseAndSetAttribute(IcuTestErrorCode & errorCode)1084 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1085     int32_t start = skipSpaces(1);
1086     int32_t equalPos = fileLine.indexOf(0x3d);
1087     if(equalPos < 0) {
1088         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1089             parseAndSetReorderCodes(start + 7, errorCode);
1090             return;
1091         }
1092         errln("missing '=' on line %d", (int)fileLineNumber);
1093         infoln(fileLine);
1094         errorCode.set(U_PARSE_ERROR);
1095         return;
1096     }
1097 
1098     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1099     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1100     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1101         UColReorderCode max;
1102         if(valueString == UNICODE_STRING("space", 5)) {
1103             max = UCOL_REORDER_CODE_SPACE;
1104         } else if(valueString == UNICODE_STRING("punct", 5)) {
1105             max = UCOL_REORDER_CODE_PUNCTUATION;
1106         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1107             max = UCOL_REORDER_CODE_SYMBOL;
1108         } else if(valueString == UNICODE_STRING("currency", 8)) {
1109             max = UCOL_REORDER_CODE_CURRENCY;
1110         } else {
1111             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1112             infoln(fileLine);
1113             errorCode.set(U_PARSE_ERROR);
1114             return;
1115         }
1116         coll->setMaxVariable(max, errorCode);
1117         if(errorCode.isFailure()) {
1118             errln("setMaxVariable() failed on line %d: %s",
1119                   (int)fileLineNumber, errorCode.errorName());
1120             infoln(fileLine);
1121             return;
1122         }
1123         fileLine.remove();
1124         return;
1125     }
1126 
1127     UColAttribute attr;
1128     for(int32_t i = 0;; ++i) {
1129         if(i == LENGTHOF(attributes)) {
1130             errln("invalid attribute name on line %d", (int)fileLineNumber);
1131             infoln(fileLine);
1132             errorCode.set(U_PARSE_ERROR);
1133             return;
1134         }
1135         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1136             attr = attributes[i].attr;
1137             break;
1138         }
1139     }
1140 
1141     UColAttributeValue value;
1142     for(int32_t i = 0;; ++i) {
1143         if(i == LENGTHOF(attributeValues)) {
1144             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1145             infoln(fileLine);
1146             errorCode.set(U_PARSE_ERROR);
1147             return;
1148         }
1149         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1150             value = attributeValues[i].value;
1151             break;
1152         }
1153     }
1154 
1155     coll->setAttribute(attr, value, errorCode);
1156     if(errorCode.isFailure()) {
1157         errln("illegal attribute=value combination on line %d: %s",
1158               (int)fileLineNumber, errorCode.errorName());
1159         infoln(fileLine);
1160         return;
1161     }
1162     fileLine.remove();
1163 }
1164 
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1165 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1166     UVector32 reorderCodes(errorCode);
1167     while(start < fileLine.length()) {
1168         start = skipSpaces(start);
1169         int32_t limit = start;
1170         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1171         CharString name;
1172         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1173         int32_t code = CollationRuleParser::getReorderCode(name.data());
1174         if(code < -1) {
1175             errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1176             infoln(fileLine);
1177             errorCode.set(U_PARSE_ERROR);
1178             return;
1179         }
1180         reorderCodes.addElement(code, errorCode);
1181         start = limit;
1182     }
1183     coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1184     if(errorCode.isFailure()) {
1185         errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber, errorCode.errorName());
1186         infoln(fileLine);
1187         return;
1188     }
1189     fileLine.remove();
1190 }
1191 
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1192 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1193     UnicodeString rules;
1194     while(readLine(f, errorCode)) {
1195         if(fileLine.isEmpty()) { continue; }
1196         if(isSectionStarter(fileLine[0])) { break; }
1197         rules.append(fileLine.unescape());
1198     }
1199     if(errorCode.isFailure()) { return; }
1200     logln(rules);
1201 
1202     UParseError parseError;
1203     UnicodeString reason;
1204     delete coll;
1205     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1206     if(coll == NULL) {
1207         errln("unable to allocate a new collator");
1208         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1209         return;
1210     }
1211     if(errorCode.isFailure()) {
1212         errln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1213         infoln(UnicodeString("  reason: ") + reason);
1214         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1215         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1216             infoln(UnicodeString("  snippet: ...") +
1217                 parseError.preContext + "(!)" + parseError.postContext + "...");
1218         }
1219     } else {
1220         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1221                      UnicodeString(), reason);
1222     }
1223 }
1224 
setRootCollator(IcuTestErrorCode & errorCode)1225 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1226     if(errorCode.isFailure()) { return; }
1227     delete coll;
1228     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1229     if(errorCode.isFailure()) {
1230         dataerrln("unable to create a root collator");
1231         return;
1232     }
1233 }
1234 
setLocaleCollator(IcuTestErrorCode & errorCode)1235 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1236     if(errorCode.isFailure()) { return; }
1237     int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1238     if(at >= 0) {
1239         fileLine.setCharAt(at, (UChar)0x2a);  // *
1240     }
1241     CharString localeID;
1242     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1243     if(at >= 0) {
1244         localeID.data()[at - 9] = '@';
1245     }
1246     Locale locale(localeID.data());
1247     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1248         errln("invalid language tag on line %d", (int)fileLineNumber);
1249         infoln(fileLine);
1250         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1251         return;
1252     }
1253 
1254     logln("creating a collator for locale ID %s", locale.getName());
1255     Collator *newColl = Collator::createInstance(locale, errorCode);
1256     if(errorCode.isFailure()) {
1257         dataerrln("unable to create a collator for locale %s on line %d",
1258                   locale.getName(), (int)fileLineNumber);
1259         infoln(fileLine);
1260         return;
1261     }
1262     delete coll;
1263     coll = newColl;
1264 }
1265 
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1266 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1267     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1268     // In some sequences with Tibetan composite vowel signs,
1269     // even if the string passes the FCD check,
1270     // those composites must be decomposed.
1271     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1272     int32_t index = 0;
1273     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1274         if(++index < s.length()) {
1275             UChar c = s[index];
1276             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1277         }
1278     }
1279     return FALSE;
1280 }
1281 
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1282 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1283                                      CharString &dest, int32_t partSize,
1284                                      IcuTestErrorCode &errorCode) {
1285     if(errorCode.isFailure()) { return FALSE; }
1286     uint8_t part[32];
1287     U_ASSERT(partSize <= LENGTHOF(part));
1288     UCharIterator iter;
1289     uiter_setString(&iter, s, length);
1290     uint32_t state[2] = { 0, 0 };
1291     for(;;) {
1292         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1293         UBool done = partLength < partSize;
1294         if(done) {
1295             // At the end, append the next byte as well which should be 00.
1296             ++partLength;
1297         }
1298         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1299         if(done) {
1300             return errorCode.isSuccess();
1301         }
1302     }
1303 }
1304 
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1305 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1306                                      const UChar *s, int32_t length,
1307                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1308     if(errorCode.isFailure()) { return FALSE; }
1309     coll->getCollationKey(s, length, key, errorCode);
1310     if(errorCode.isFailure()) {
1311         infoln(fileTestName);
1312         errln("Collator(%s).getCollationKey() failed: %s",
1313               norm, errorCode.errorName());
1314         infoln(line);
1315         return FALSE;
1316     }
1317     int32_t keyLength;
1318     const uint8_t *keyBytes = key.getByteArray(keyLength);
1319     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1320         infoln(fileTestName);
1321         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1322               norm);
1323         infoln(line);
1324         infoln(printCollationKey(key));
1325         return FALSE;
1326     }
1327 
1328     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1329     if(numLevels < UCOL_IDENTICAL) {
1330         ++numLevels;
1331     } else {
1332         numLevels = 5;
1333     }
1334     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1335         ++numLevels;
1336     }
1337     errorCode.assertSuccess();
1338     int32_t numLevelSeparators = 0;
1339     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1340         uint8_t b = keyBytes[i];
1341         if(b == 0) {
1342             infoln(fileTestName);
1343             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1344             infoln(line);
1345             infoln(printCollationKey(key));
1346             return FALSE;
1347         }
1348         if(b == 1) { ++numLevelSeparators; }
1349     }
1350     if(numLevelSeparators != (numLevels - 1)) {
1351         infoln(fileTestName);
1352         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1353               norm, (int)numLevelSeparators, (int)numLevels);
1354         infoln(line);
1355         infoln(printCollationKey(key));
1356         return FALSE;
1357     }
1358 
1359     // If s contains U+FFFE, check that merged segments make the same key.
1360     LocalMemory<uint8_t> mergedKey;
1361     int32_t mergedKeyLength = 0;
1362     int32_t mergedKeyCapacity = 0;
1363     int32_t sLength = (length >= 0) ? length : u_strlen(s);
1364     int32_t segmentStart = 0;
1365     for(int32_t i = 0;;) {
1366         if(i == sLength) {
1367             if(segmentStart == 0) {
1368                 // s does not contain any U+FFFE.
1369                 break;
1370             }
1371         } else if(s[i] != 0xfffe) {
1372             ++i;
1373             continue;
1374         }
1375         // Get the sort key for another segment and merge it into mergedKey.
1376         CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1377         CollationKey key2;
1378         coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1379         int32_t key1Length, key2Length;
1380         const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1381         const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1382         uint8_t *dest;
1383         int32_t minCapacity = key1Length + key2Length;
1384         if(key1Length > 0) { --minCapacity; }
1385         if(minCapacity <= mergedKeyCapacity) {
1386             dest = mergedKey.getAlias();
1387         } else {
1388             if(minCapacity <= 200) {
1389                 mergedKeyCapacity = 200;
1390             } else if(minCapacity <= 2 * mergedKeyCapacity) {
1391                 mergedKeyCapacity *= 2;
1392             } else {
1393                 mergedKeyCapacity = minCapacity;
1394             }
1395             dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1396         }
1397         U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1398         if(key1Length == 0) {
1399             // key2 is the sort key for the first segment.
1400             uprv_memcpy(dest, key2Bytes, key2Length);
1401             mergedKeyLength = key2Length;
1402         } else {
1403             mergedKeyLength =
1404                 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1405                                    dest, mergedKeyCapacity);
1406         }
1407         if(i == sLength) { break; }
1408         segmentStart = ++i;
1409     }
1410     if(segmentStart != 0 &&
1411             (mergedKeyLength != keyLength ||
1412             uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
1413         infoln(fileTestName);
1414         errln("Collator(%s).getCollationKey(with U+FFFE) != "
1415               "ucol_mergeSortkeys(segments)",
1416               norm);
1417         infoln(line);
1418         infoln(printCollationKey(key));
1419         infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
1420         return FALSE;
1421     }
1422 
1423     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1424     static const int32_t partSizes[] = { 32, 3, 1 };
1425     for(int32_t psi = 0; psi < LENGTHOF(partSizes); ++psi) {
1426         int32_t partSize = partSizes[psi];
1427         CharString parts;
1428         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1429             infoln(fileTestName);
1430             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1431                   norm, (int)partSize, errorCode.errorName());
1432             infoln(line);
1433             return FALSE;
1434         }
1435         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1436             infoln(fileTestName);
1437             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1438                   norm, (int)partSize);
1439             infoln(line);
1440             infoln(printCollationKey(key));
1441             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1442             return FALSE;
1443         }
1444     }
1445     return TRUE;
1446 }
1447 
1448 namespace {
1449 
1450 /**
1451  * Replaces unpaired surrogates with U+FFFD.
1452  * Returns s if no replacement was made, otherwise buffer.
1453  */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1454 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1455     int32_t i = 0;
1456     while(i < s.length()) {
1457         UChar32 c = s.char32At(i);
1458         if(U_IS_SURROGATE(c)) {
1459             if(buffer.length() < i) {
1460                 buffer.append(s, buffer.length(), i - buffer.length());
1461             }
1462             buffer.append((UChar)0xfffd);
1463         }
1464         i += U16_LENGTH(c);
1465     }
1466     if(buffer.isEmpty()) {
1467         return s;
1468     }
1469     if(buffer.length() < i) {
1470         buffer.append(s, buffer.length(), i - buffer.length());
1471     }
1472     return buffer;
1473 }
1474 
1475 }
1476 
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1477 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1478                                      const UnicodeString &prevString, const UnicodeString &s,
1479                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1480                                      IcuTestErrorCode &errorCode) {
1481     if(errorCode.isFailure()) { return FALSE; }
1482 
1483     // Get the sort keys first, for error debug output.
1484     CollationKey prevKey;
1485     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1486                         prevKey, errorCode)) {
1487         return FALSE;
1488     }
1489     CollationKey key;
1490     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1491 
1492     UCollationResult order = coll->compare(prevString, s, errorCode);
1493     if(order != expectedOrder || errorCode.isFailure()) {
1494         infoln(fileTestName);
1495         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1496               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1497         infoln(prevFileLine);
1498         infoln(fileLine);
1499         infoln(printCollationKey(prevKey));
1500         infoln(printCollationKey(key));
1501         return FALSE;
1502     }
1503     order = coll->compare(s, prevString, errorCode);
1504     if(order != -expectedOrder || errorCode.isFailure()) {
1505         infoln(fileTestName);
1506         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1507               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1508         infoln(prevFileLine);
1509         infoln(fileLine);
1510         infoln(printCollationKey(prevKey));
1511         infoln(printCollationKey(key));
1512         return FALSE;
1513     }
1514     // Test NUL-termination if the strings do not contain NUL characters.
1515     UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1516     if(!containNUL) {
1517         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1518         if(order != expectedOrder || errorCode.isFailure()) {
1519             infoln(fileTestName);
1520             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1521                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1522             infoln(prevFileLine);
1523             infoln(fileLine);
1524             infoln(printCollationKey(prevKey));
1525             infoln(printCollationKey(key));
1526             return FALSE;
1527         }
1528         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1529         if(order != -expectedOrder || errorCode.isFailure()) {
1530             infoln(fileTestName);
1531             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1532                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1533             infoln(prevFileLine);
1534             infoln(fileLine);
1535             infoln(printCollationKey(prevKey));
1536             infoln(printCollationKey(key));
1537             return FALSE;
1538         }
1539     }
1540 
1541 #if U_HAVE_STD_STRING
1542     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1543     // Unpaired surrogates cannot be converted to UTF-8.
1544     // Create valid UTF-16 strings if necessary, and use those for
1545     // both the expected compare() result and for the input to compare(UTF-8).
1546     UnicodeString prevBuffer, sBuffer;
1547     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1548     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1549     std::string prevUTF8, sUTF8;
1550     UnicodeString(prevValid).toUTF8String(prevUTF8);
1551     UnicodeString(sValid).toUTF8String(sUTF8);
1552     UCollationResult expectedUTF8Order;
1553     if(&prevValid == &prevString && &sValid == &s) {
1554         expectedUTF8Order = expectedOrder;
1555     } else {
1556         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1557     }
1558 
1559     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1560     if(order != expectedUTF8Order || errorCode.isFailure()) {
1561         infoln(fileTestName);
1562         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1563               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1564         infoln(prevFileLine);
1565         infoln(fileLine);
1566         infoln(printCollationKey(prevKey));
1567         infoln(printCollationKey(key));
1568         return FALSE;
1569     }
1570     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1571     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1572         infoln(fileTestName);
1573         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1574               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1575         infoln(prevFileLine);
1576         infoln(fileLine);
1577         infoln(printCollationKey(prevKey));
1578         infoln(printCollationKey(key));
1579         return FALSE;
1580     }
1581     // Test NUL-termination if the strings do not contain NUL characters.
1582     if(!containNUL) {
1583         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1584         if(order != expectedUTF8Order || errorCode.isFailure()) {
1585             infoln(fileTestName);
1586             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1587                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1588             infoln(prevFileLine);
1589             infoln(fileLine);
1590             infoln(printCollationKey(prevKey));
1591             infoln(printCollationKey(key));
1592             return FALSE;
1593         }
1594         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1595         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1596             infoln(fileTestName);
1597             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1598                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1599             infoln(prevFileLine);
1600             infoln(fileLine);
1601             infoln(printCollationKey(prevKey));
1602             infoln(printCollationKey(key));
1603             return FALSE;
1604         }
1605     }
1606 #endif
1607 
1608     UCharIterator leftIter;
1609     UCharIterator rightIter;
1610     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1611     uiter_setString(&rightIter, s.getBuffer(), s.length());
1612     order = coll->compare(leftIter, rightIter, errorCode);
1613     if(order != expectedOrder || errorCode.isFailure()) {
1614         infoln(fileTestName);
1615         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1616               "wrong order: %d != %d (%s)",
1617               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1618         infoln(prevFileLine);
1619         infoln(fileLine);
1620         infoln(printCollationKey(prevKey));
1621         infoln(printCollationKey(key));
1622         return FALSE;
1623     }
1624 
1625     order = prevKey.compareTo(key, errorCode);
1626     if(order != expectedOrder || errorCode.isFailure()) {
1627         infoln(fileTestName);
1628         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1629               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1630         infoln(prevFileLine);
1631         infoln(fileLine);
1632         infoln(printCollationKey(prevKey));
1633         infoln(printCollationKey(key));
1634         return FALSE;
1635     }
1636     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1637         int32_t prevKeyLength;
1638         const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1639         int32_t keyLength;
1640         const uint8_t *bytes = key.getByteArray(keyLength);
1641         int32_t level = Collation::PRIMARY_LEVEL;
1642         for(int32_t i = 0;; ++i) {
1643             uint8_t b = prevBytes[i];
1644             if(b != bytes[i]) { break; }
1645             if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1646                 ++level;
1647                 if(level == Collation::CASE_LEVEL &&
1648                         coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) {
1649                     ++level;
1650                 }
1651             }
1652         }
1653         if(level != expectedLevel) {
1654             infoln(fileTestName);
1655             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1656                   (int)fileLineNumber, norm, order, level, expectedLevel);
1657             infoln(prevFileLine);
1658             infoln(fileLine);
1659             infoln(printCollationKey(prevKey));
1660             infoln(printCollationKey(key));
1661             return FALSE;
1662         }
1663     }
1664     return TRUE;
1665 }
1666 
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1667 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1668     if(errorCode.isFailure()) { return; }
1669     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1670     UnicodeString prevString, s;
1671     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1672     while(readLine(f, errorCode)) {
1673         if(fileLine.isEmpty()) { continue; }
1674         if(isSectionStarter(fileLine[0])) { break; }
1675         Collation::Level relation = parseRelationAndString(s, errorCode);
1676         if(errorCode.isFailure()) {
1677             errorCode.reset();
1678             break;
1679         }
1680         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1681         Collation::Level expectedLevel = relation;
1682         s.getTerminatedBuffer();  // Ensure NUL-termination.
1683         UBool isOk = TRUE;
1684         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1685             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1686             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1687                                    expectedOrder, expectedLevel, errorCode);
1688         }
1689         if(isOk) {
1690             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1691             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1692                                    expectedOrder, expectedLevel, errorCode);
1693         }
1694         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1695             UnicodeString pn = nfd->normalize(prevString, errorCode);
1696             UnicodeString n = nfd->normalize(s, errorCode);
1697             pn.getTerminatedBuffer();
1698             n.getTerminatedBuffer();
1699             errorCode.assertSuccess();
1700             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1701                                    expectedOrder, expectedLevel, errorCode);
1702         }
1703         if(!isOk) {
1704             errorCode.reset();  // already reported
1705         }
1706         prevFileLine = fileLine;
1707         prevString = s;
1708         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1709     }
1710 }
1711 
TestDataDriven()1712 void CollationTest::TestDataDriven() {
1713     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1714 
1715     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1716     nfd = Normalizer2Factory::getNFDInstance(errorCode);
1717     if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1718         return;
1719     }
1720 
1721     CharString path(getSourceTestData(errorCode), errorCode);
1722     path.appendPathPart("collationtest.txt", errorCode);
1723     const char *codePage = "UTF-8";
1724     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1725     if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1726         return;
1727     }
1728     while(errorCode.isSuccess()) {
1729         // Read a new line if necessary.
1730         // Sub-parsers leave the first line set that they do not handle.
1731         if(fileLine.isEmpty()) {
1732             if(!readLine(f.getAlias(), errorCode)) { break; }
1733             continue;
1734         }
1735         if(!isSectionStarter(fileLine[0])) {
1736             errln("syntax error on line %d", (int)fileLineNumber);
1737             infoln(fileLine);
1738             return;
1739         }
1740         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1741             fileTestName = fileLine;
1742             logln(fileLine);
1743             fileLine.remove();
1744         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1745             setRootCollator(errorCode);
1746             fileLine.remove();
1747         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1748             setLocaleCollator(errorCode);
1749             fileLine.remove();
1750         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1751             buildTailoring(f.getAlias(), errorCode);
1752         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1753             parseAndSetAttribute(errorCode);
1754         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1755             checkCompareStrings(f.getAlias(), errorCode);
1756         } else {
1757             errln("syntax error on line %d", (int)fileLineNumber);
1758             infoln(fileLine);
1759             return;
1760         }
1761     }
1762 }
1763 
1764 #endif  // !UCONFIG_NO_COLLATION
1765