• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/stringpiece.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uniset.h"
29 #include "unicode/unistr.h"
30 #include "unicode/usetiter.h"
31 #include "unicode/ustring.h"
32 #include "charstr.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationfcd.h"
37 #include "collationiterator.h"
38 #include "collationroot.h"
39 #include "collationrootelements.h"
40 #include "collationruleparser.h"
41 #include "collationweights.h"
42 #include "cstring.h"
43 #include "intltest.h"
44 #include "normalizer2impl.h"
45 #include "ucbuf.h"
46 #include "uhash.h"
47 #include "uitercollationiterator.h"
48 #include "utf16collationiterator.h"
49 #include "utf8collationiterator.h"
50 #include "uvectr32.h"
51 #include "uvectr64.h"
52 #include "writesrc.h"
53 
54 class CodePointIterator;
55 
56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57 
58 class CollationTest : public IntlTest {
59 public:
CollationTest()60     CollationTest()
61             : fcd(NULL), nfd(NULL),
62               fileLineNumber(0),
63               coll(NULL) {}
64 
~CollationTest()65     ~CollationTest() {
66         delete coll;
67     }
68 
69     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL) override;
70 
71     void TestMinMax();
72     void TestImplicits();
73     void TestNulTerminated();
74     void TestIllegalUTF8();
75     void TestShortFCDData();
76     void TestFCD();
77     void TestCollationWeights();
78     void TestRootElements();
79     void TestTailoredElements();
80     void TestDataDriven();
81     void TestLongLocale();
82     void TestBuilderContextsOverflow();
83 
84 private:
85     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
86     void checkAllocWeights(CollationWeights &cw,
87                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
88                            int32_t someLength, int32_t minCount);
89 
90     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
91     static UnicodeString printCollationKey(const CollationKey &key);
92 
93     // Helpers & fields for data-driven test.
isCROrLF(UChar c)94     static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)95     static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)96     static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
skipSpaces(int32_t i)97     int32_t skipSpaces(int32_t i) {
98         while(isSpace(fileLine[i])) { ++i; }
99         return i;
100     }
101 
102     UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
103     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
104     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
105     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
106     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
107     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
108     void setRootCollator(IcuTestErrorCode &errorCode);
109     void setLocaleCollator(IcuTestErrorCode &errorCode);
110 
111     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
112 
113     UBool getSortKeyParts(const UChar *s, int32_t length,
114                           CharString &dest, int32_t partSize,
115                           IcuTestErrorCode &errorCode);
116     UBool getCollationKey(const char *norm, const UnicodeString &line,
117                           const UChar *s, int32_t length,
118                           CollationKey &key, IcuTestErrorCode &errorCode);
119     UBool getMergedCollationKey(const UChar *s, int32_t length,
120                                 CollationKey &key, IcuTestErrorCode &errorCode);
121     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
122                           const UnicodeString &prevString, const UnicodeString &s,
123                           UCollationResult expectedOrder, Collation::Level expectedLevel,
124                           IcuTestErrorCode &errorCode);
125     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
126 
127     const Normalizer2 *fcd, *nfd;
128     UnicodeString fileLine;
129     int32_t fileLineNumber;
130     UnicodeString fileTestName;
131     Collator *coll;
132 };
133 
createCollationTest()134 extern IntlTest *createCollationTest() {
135     return new CollationTest();
136 }
137 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)138 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
139     if(exec) {
140         logln("TestSuite CollationTest: ");
141     }
142     TESTCASE_AUTO_BEGIN;
143     TESTCASE_AUTO(TestMinMax);
144     TESTCASE_AUTO(TestImplicits);
145     TESTCASE_AUTO(TestNulTerminated);
146     TESTCASE_AUTO(TestIllegalUTF8);
147     TESTCASE_AUTO(TestShortFCDData);
148     TESTCASE_AUTO(TestFCD);
149     TESTCASE_AUTO(TestCollationWeights);
150     TESTCASE_AUTO(TestRootElements);
151     TESTCASE_AUTO(TestTailoredElements);
152     TESTCASE_AUTO(TestDataDriven);
153     TESTCASE_AUTO(TestLongLocale);
154     TESTCASE_AUTO(TestBuilderContextsOverflow);
155     TESTCASE_AUTO_END;
156 }
157 
TestMinMax()158 void CollationTest::TestMinMax() {
159     IcuTestErrorCode errorCode(*this, "TestMinMax");
160 
161     setRootCollator(errorCode);
162     if(errorCode.isFailure()) {
163         errorCode.reset();
164         return;
165     }
166     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
167     if(rbc == NULL) {
168         errln("the root collator is not a RuleBasedCollator");
169         return;
170     }
171 
172     static const UChar s[2] = { 0xfffe, 0xffff };
173     UVector64 ces(errorCode);
174     rbc->internalGetCEs(UnicodeString(false, s, 2), ces, errorCode);
175     errorCode.assertSuccess();
176     if(ces.size() != 2) {
177         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
178         return;
179     }
180     int64_t ce = ces.elementAti(0);
181     int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
182     if(ce != expected) {
183         errln("CE(U+fffe)=%04lx != 02..", (long)ce);
184     }
185 
186     ce = ces.elementAti(1);
187     expected = Collation::makeCE(Collation::MAX_PRIMARY);
188     if(ce != expected) {
189         errln("CE(U+ffff)=%04lx != max..", (long)ce);
190     }
191 }
192 
TestImplicits()193 void CollationTest::TestImplicits() {
194     IcuTestErrorCode errorCode(*this, "TestImplicits");
195 
196     const CollationData *cd = CollationRoot::getData(errorCode);
197     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
198         return;
199     }
200 
201     // Implicit primary weights should be assigned for the following sets,
202     // and sort in ascending order by set and then code point.
203     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
204 
205     // core Han Unified Ideographs
206     UnicodeSet coreHan("[\\p{unified_ideograph}&"
207                             "[\\p{Block=CJK_Unified_Ideographs}"
208                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
209                        errorCode);
210     // all other Unified Han ideographs
211     UnicodeSet otherHan("[\\p{unified ideograph}-"
212                             "[\\p{Block=CJK_Unified_Ideographs}"
213                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
214                         errorCode);
215     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
216     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
217 
218     // Starting with CLDR 26/ICU 54, the root Han order may instead be
219     // the Unihan radical-stroke order.
220     // The tests should pass either way, so we only test the order of a small set of Han characters
221     // whose radical-stroke order is the same as their code point order.
222     UnicodeSet someHanInCPOrder(
223             "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
224             "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
225             errorCode);
226     UnicodeSet inOrder(someHanInCPOrder);
227     inOrder.addAll(unassigned).freeze();
228     if(errorCode.errIfFailureAndReset("UnicodeSet")) {
229         return;
230     }
231     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
232     UChar32 prev = 0;
233     uint32_t prevPrimary = 0;
234     UTF16CollationIterator ci(cd, false, NULL, NULL, NULL);
235     for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
236         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
237         while(iter->next()) {
238             UChar32 c = iter->getCodepoint();
239             UnicodeString s(c);
240             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
241             int64_t ce = ci.nextCE(errorCode);
242             int64_t ce2 = ci.nextCE(errorCode);
243             if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
244                 return;
245             }
246             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
247                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
248                 continue;
249             }
250             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
251                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
252                       (long)c, (long)(ce & 0xffffffff));
253                 continue;
254             }
255             uint32_t primary = (uint32_t)(ce >> 32);
256             if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
257                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
258                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
259             }
260             prev = c;
261             prevPrimary = primary;
262         }
263     }
264 }
265 
TestNulTerminated()266 void CollationTest::TestNulTerminated() {
267     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
268     const CollationData *data = CollationRoot::getData(errorCode);
269     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
270         return;
271     }
272 
273     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
274 
275     UTF16CollationIterator ci1(data, false, s, s, s + 2);
276     UTF16CollationIterator ci2(data, false, s + 2, s + 2, NULL);
277     for(int32_t i = 0;; ++i) {
278         int64_t ce1 = ci1.nextCE(errorCode);
279         int64_t ce2 = ci2.nextCE(errorCode);
280         if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
281             return;
282         }
283         if(ce1 != ce2) {
284             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
285             break;
286         }
287         if(ce1 == Collation::NO_CE) { break; }
288     }
289 }
290 
TestIllegalUTF8()291 void CollationTest::TestIllegalUTF8() {
292     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
293 
294     setRootCollator(errorCode);
295     if(errorCode.isFailure()) {
296         errorCode.reset();
297         return;
298     }
299     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
300 
301     static const StringPiece strings[] = {
302         // string with U+FFFD == illegal byte sequence
303         u8"a\uFFFDz",                   "a\x80z",  // trail byte
304         u8"a\uFFFD\uFFFDz",             "a\xc1\x81z",  // non-shortest form
305         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xe0\x82\x83z",  // non-shortest form
306         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
307         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
308         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz",  // non-shortest form
309         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
310     };
311 
312     for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
313         StringPiece fffd(strings[i]);
314         StringPiece illegal(strings[i + 1]);
315         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
316         if(order != UCOL_EQUAL) {
317             errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
318                   (int)i, order);
319         }
320     }
321 }
322 
323 namespace {
324 
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)325 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
326     for(UChar32 c = 0x10000; c < 0x110000;) {
327         UChar32 next = c + 0x400;
328         if(src.containsSome(c, next - 1)) {
329             dest.add(U16_LEAD(c));
330         }
331         c = next;
332     }
333 }
334 
335 }  // namespace
336 
TestShortFCDData()337 void CollationTest::TestShortFCDData() {
338     // See CollationFCD class comments.
339     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
340     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
341     errorCode.assertSuccess();
342     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
343     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
344     UnicodeSet lccc;  // actual
345     for(UChar32 c = 0; c <= 0xffff; ++c) {
346         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
347     }
348     UnicodeSet diff(expectedLccc);
349     diff.removeAll(lccc);
350     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
351     UnicodeString empty("[]");
352     UnicodeString diffString;
353     diff.toPattern(diffString, true);
354     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
355     diff = lccc;
356     diff.removeAll(expectedLccc);
357     diff.toPattern(diffString, true);
358     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, true);
359 
360     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
361     if (errorCode.isSuccess()) {
362         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
363         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
364         UnicodeSet tccc;  // actual
365         for(UChar32 c = 0; c <= 0xffff; ++c) {
366             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
367         }
368         diff = expectedTccc;
369         diff.removeAll(tccc);
370         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
371         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
372         diff = tccc;
373         diff.removeAll(expectedTccc);
374         diff.toPattern(diffString, true);
375         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
376     }
377 }
378 
379 class CodePointIterator {
380 public:
CodePointIterator(const UChar32 * cp,int32_t length)381     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()382     void resetToStart() { pos = 0; }
next()383     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()384     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const385     int32_t getLength() const { return length; }
getIndex() const386     int getIndex() const { return (int)pos; }
387 private:
388     const UChar32 *cp;
389     int32_t length;
390     int32_t pos;
391 };
392 
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)393 void CollationTest::checkFCD(const char *name,
394                              CollationIterator &ci, CodePointIterator &cpi) {
395     IcuTestErrorCode errorCode(*this, "checkFCD");
396 
397     // Iterate forward to the limit.
398     for(;;) {
399         UChar32 c1 = ci.nextCodePoint(errorCode);
400         UChar32 c2 = cpi.next();
401         if(c1 != c2) {
402             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
403                   name, (long)c1, (long)c2, cpi.getIndex());
404             return;
405         }
406         if(c1 < 0) { break; }
407     }
408 
409     // Iterate backward most of the way.
410     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
411         UChar32 c1 = ci.previousCodePoint(errorCode);
412         UChar32 c2 = cpi.previous();
413         if(c1 != c2) {
414             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
415                   name, (long)c1, (long)c2, cpi.getIndex());
416             return;
417         }
418     }
419 
420     // Forward again.
421     for(;;) {
422         UChar32 c1 = ci.nextCodePoint(errorCode);
423         UChar32 c2 = cpi.next();
424         if(c1 != c2) {
425             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
426                   name, (long)c1, (long)c2, cpi.getIndex());
427             return;
428         }
429         if(c1 < 0) { break; }
430     }
431 
432     // Iterate backward to the start.
433     for(;;) {
434         UChar32 c1 = ci.previousCodePoint(errorCode);
435         UChar32 c2 = cpi.previous();
436         if(c1 != c2) {
437             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
438                   name, (long)c1, (long)c2, cpi.getIndex());
439             return;
440         }
441         if(c1 < 0) { break; }
442     }
443 }
444 
TestFCD()445 void CollationTest::TestFCD() {
446     IcuTestErrorCode errorCode(*this, "TestFCD");
447     const CollationData *data = CollationRoot::getData(errorCode);
448     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
449         return;
450     }
451 
452     // Input string, not FCD, NUL-terminated.
453     static const UChar s[] = {
454         0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
455         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
456         0x327, 0x308,  // ccc=202, 230
457         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
458         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
459         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
460         0xac01,
461         0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
462         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
463         0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
464         0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
465         0x4e00, 0xf81,
466         0
467     };
468     // Expected code points.
469     static const UChar32 cp[] = {
470         0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
471         0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
472         0x1D15F, 0x1D16D,
473         0xac01,
474         0x63, 0x327, 0x1D165, 0x1D16D,
475         0x61,
476         0xf71, 0xf71, 0xf72, 0xf74, 0x301,
477         0x4e00, 0xf71, 0xf80
478     };
479 
480     FCDUTF16CollationIterator u16ci(data, false, s, s, NULL);
481     if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
482         return;
483     }
484     CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
485     checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
486 
487     cpi.resetToStart();
488     std::string utf8;
489     UnicodeString(s).toUTF8String(utf8);
490     FCDUTF8CollationIterator u8ci(data, false,
491                                   reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
492     if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
493         return;
494     }
495     checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
496 
497     cpi.resetToStart();
498     UCharIterator iter;
499     uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
500     FCDUIterCollationIterator uici(data, false, iter, 0);
501     if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
502         return;
503     }
504     checkFCD("FCDUIterCollationIterator", uici, cpi);
505 }
506 
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)507 void CollationTest::checkAllocWeights(CollationWeights &cw,
508                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
509                                       int32_t someLength, int32_t minCount) {
510     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
511         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = false",
512               (long)lowerLimit, (long)upperLimit, (long)n);
513         return;
514     }
515     uint32_t previous = lowerLimit;
516     int32_t count = 0;  // number of weights that have someLength
517     for(int32_t i = 0; i < n; ++i) {
518         uint32_t w = cw.nextWeight();
519         if(w == 0xffffffff) {
520             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
521                   "returns only %ld weights",
522                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
523             return;
524         }
525         if(!(previous < w && w < upperLimit)) {
526             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
527                   "number %ld -> %lx not between %lx and %lx",
528                   (long)lowerLimit, (long)upperLimit, (long)n,
529                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
530             return;
531         }
532         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
533     }
534     if(count < minCount) {
535         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
536               "returns only %ld < %ld weights of length %d",
537               (long)lowerLimit, (long)upperLimit, (long)n,
538               (long)count, (long)minCount, (int)someLength);
539     }
540 }
541 
TestCollationWeights()542 void CollationTest::TestCollationWeights() {
543     CollationWeights cw;
544 
545     // Non-compressible primaries use 254 second bytes 02..FF.
546     logln("CollationWeights.initForPrimary(non-compressible)");
547     cw.initForPrimary(false);
548     // Expect 1 weight 11 and 254 weights 12xx.
549     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
550     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
551     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
552     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
553     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
554     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
555     // Expect 254^2=64516 three-byte weights.
556     // During computation, there should be 3 three-byte ranges
557     // 10ffff, 11xxxx, 120202.
558     // The middle one should be split 64515:1,
559     // and the newly-split-off range and the last ranged lengthened.
560     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
561     // Expect weights 1102 & 1103.
562     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
563     // Expect weights 102102 & 102103.
564     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
565 
566     // Compressible primaries use 251 second bytes 04..FE.
567     logln("CollationWeights.initForPrimary(compressible)");
568     cw.initForPrimary(true);
569     // Expect 1 weight 11 and 251 weights 12xx.
570     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
571     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
572     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
573     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
574     // Expect weights 1104 & 1105.
575     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
576     // Expect weights 102102 & 102103.
577     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
578 
579     // Secondary and tertiary weights use only bytes 3 & 4.
580     logln("CollationWeights.initForSecondary()");
581     cw.initForSecondary();
582     // Expect weights fbxx and all four fc..ff.
583     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
584 
585     logln("CollationWeights.initForTertiary()");
586     cw.initForTertiary();
587     // Expect weights 3dxx and both 3e & 3f.
588     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
589 }
590 
591 namespace {
592 
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)593 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
594                 uint32_t p, uint32_t s, uint32_t ctq) {
595     uint32_t p1 = p >> 24;
596     uint32_t p2 = (p >> 16) & 0xff;
597     uint32_t p3 = (p >> 8) & 0xff;
598     uint32_t p4 = p & 0xff;
599     uint32_t s1 = s >> 8;
600     uint32_t s2 = s & 0xff;
601     // ctq = Case, Tertiary, Quaternary
602     uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
603     uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
604     uint32_t t1 = t >> 8;
605     uint32_t t2 = t & 0xff;
606     uint32_t q = ctq & Collation::QUATERNARY_MASK;
607     // No leading zero bytes.
608     if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
609         return false;
610     }
611     // No intermediate zero bytes.
612     if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
613         return false;
614     }
615     if(p2 != 0 && p3 == 0 && p4 != 0) {
616         return false;
617     }
618     // Minimum & maximum lead bytes.
619     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
620             s1 == Collation::LEVEL_SEPARATOR_BYTE ||
621             t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
622         return false;
623     }
624     if(c > 2) {
625         return false;
626     }
627     // The valid byte range for the second primary byte depends on compressibility.
628     if(p2 != 0) {
629         if(data.isCompressibleLeadByte(p1)) {
630             if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
631                     Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
632                 return false;
633             }
634         } else {
635             if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
636                 return false;
637             }
638         }
639     }
640     // Other bytes just need to avoid the level separator.
641     // Trailing zeros are ok.
642     U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
643     if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
644             s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
645         return false;
646     }
647     // Well-formed CEs.
648     if(p == 0) {
649         if(s == 0) {
650             if(t == 0) {
651                 // Completely ignorable CE.
652                 // Quaternary CEs are not supported.
653                 if(c != 0 || q != 0) {
654                     return false;
655                 }
656             } else {
657                 // Tertiary CE.
658                 if(t < re.getTertiaryBoundary() || c != 2) {
659                     return false;
660                 }
661             }
662         } else {
663             // Secondary CE.
664             if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
665                 return false;
666             }
667         }
668     } else {
669         // Primary CE.
670         if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
671                 s >= re.getSecondaryBoundary()) {
672             return false;
673         }
674         if(t == 0 || t >= re.getTertiaryBoundary()) {
675             return false;
676         }
677     }
678     return true;
679 }
680 
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)681 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
682     uint32_t p = (uint32_t)(ce >> 32);
683     uint32_t secTer = (uint32_t)ce;
684     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
685 }
686 
687 class RootElementsIterator {
688 public:
RootElementsIterator(const CollationData & root)689     RootElementsIterator(const CollationData &root)
690             : data(root),
691               elements(root.rootElements), length(root.rootElementsLength),
692               pri(0), secTer(0),
693               index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
694 
next()695     UBool next() {
696         if(index >= length) { return false; }
697         uint32_t p = elements[index];
698         if(p == CollationRootElements::PRIMARY_SENTINEL) { return false; }
699         if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
700             ++index;
701             secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
702             return true;
703         }
704         if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
705             // End of a range, enumerate the primaries in the range.
706             int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
707             p &= 0xffffff00;
708             if(pri == p) {
709                 // Finished the range, return the next CE after it.
710                 ++index;
711                 return next();
712             }
713             U_ASSERT(pri < p);
714             // Return the next primary in this range.
715             UBool isCompressible = data.isCompressiblePrimary(pri);
716             if((pri & 0xffff) == 0) {
717                 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
718             } else {
719                 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
720             }
721             return true;
722         }
723         // Simple primary CE.
724         ++index;
725         pri = p;
726         // Does this have an explicit below-common sec/ter unit,
727         // or does it imply a common one?
728         if(index == length) {
729             secTer = Collation::COMMON_SEC_AND_TER_CE;
730         } else {
731             secTer = elements[index];
732             if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
733                 // No sec/ter delta.
734                 secTer = Collation::COMMON_SEC_AND_TER_CE;
735             } else {
736                 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
737                 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
738                     // Implied sec/ter.
739                     secTer = Collation::COMMON_SEC_AND_TER_CE;
740                 } else {
741                     // Explicit sec/ter below common/common.
742                     ++index;
743                 }
744             }
745         }
746         return true;
747     }
748 
getPrimary() const749     uint32_t getPrimary() const { return pri; }
getSecTer() const750     uint32_t getSecTer() const { return secTer; }
751 
752 private:
753     const CollationData &data;
754     const uint32_t *elements;
755     int32_t length;
756 
757     uint32_t pri;
758     uint32_t secTer;
759     int32_t index;
760 };
761 
762 }  // namespace
763 
TestRootElements()764 void CollationTest::TestRootElements() {
765     IcuTestErrorCode errorCode(*this, "TestRootElements");
766     const CollationData *root = CollationRoot::getData(errorCode);
767     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
768         return;
769     }
770     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
771     RootElementsIterator iter(*root);
772 
773     // We check each root CE for validity,
774     // and we also verify that there is a tailoring gap between each two CEs.
775     CollationWeights cw1c;  // compressible primary weights
776     CollationWeights cw1u;  // uncompressible primary weights
777     CollationWeights cw2;
778     CollationWeights cw3;
779 
780     cw1c.initForPrimary(true);
781     cw1u.initForPrimary(false);
782     cw2.initForSecondary();
783     cw3.initForTertiary();
784 
785     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
786     // nor the special merge-separator CE for U+FFFE.
787     uint32_t prevPri = 0;
788     uint32_t prevSec = 0;
789     uint32_t prevTer = 0;
790     while(iter.next()) {
791         uint32_t pri = iter.getPrimary();
792         uint32_t secTer = iter.getSecTer();
793         // CollationRootElements CEs must have 0 case and quaternary bits.
794         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
795             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
796                   (long)pri, (long)secTer);
797         }
798         uint32_t sec = secTer >> 16;
799         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
800         uint32_t ctq = ter;
801         if(pri == 0 && sec == 0 && ter != 0) {
802             // Tertiary CEs must have uppercase bits,
803             // but they are not stored in the CollationRootElements.
804             ctq |= 0x8000;
805         }
806         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
807             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
808         } else {
809             if(pri != prevPri) {
810                 uint32_t newWeight = 0;
811                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
812                     // There is currently no tailoring gap after primary ignorables,
813                     // and we forbid tailoring after U+FFFD and U+FFFF.
814                 } else if(root->isCompressiblePrimary(prevPri)) {
815                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
816                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
817                               (long)prevPri, (long)pri);
818                     } else {
819                         newWeight = cw1c.nextWeight();
820                     }
821                 } else {
822                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
823                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
824                               (long)prevPri, (long)pri);
825                     } else {
826                         newWeight = cw1u.nextWeight();
827                     }
828                 }
829                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
830                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
831                           (long)prevPri, (long)newWeight, (long)pri);
832                 }
833             } else if(sec != prevSec) {
834                 uint32_t lowerLimit =
835                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
836                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
837                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
838                 } else {
839                     uint32_t newWeight = cw2.nextWeight();
840                     if(!(prevSec < newWeight && newWeight < sec)) {
841                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
842                               (long)lowerLimit, (long)newWeight, (long)sec);
843                     }
844                 }
845             } else if(ter != prevTer) {
846                 uint32_t lowerLimit =
847                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
848                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
849                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
850                 } else {
851                     uint32_t newWeight = cw3.nextWeight();
852                     if(!(prevTer < newWeight && newWeight < ter)) {
853                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
854                               (long)lowerLimit, (long)newWeight, (long)ter);
855                     }
856                 }
857             } else {
858                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
859             }
860         }
861         prevPri = pri;
862         prevSec = sec;
863         prevTer = ter;
864     }
865 }
866 
TestTailoredElements()867 void CollationTest::TestTailoredElements() {
868     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
869     const CollationData *root = CollationRoot::getData(errorCode);
870     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
871         return;
872     }
873     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
874 
875     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
876     if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
877         return;
878     }
879     uhash_setKeyDeleter(prevLocales, uprv_free);
880     // TestRootElements() tests the root collator which does not have tailorings.
881     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
882     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
883     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
884 
885     UVector64 ces(errorCode);
886     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
887     U_ASSERT(locales.isValid());
888     const char *localeID = "root";
889     do {
890         Locale locale(localeID);
891         LocalPointer<StringEnumeration> types(
892                 Collator::getKeywordValuesForLocale("collation", locale, false, errorCode));
893         errorCode.assertSuccess();
894         const char *type;  // first: default type
895         while((type = types->next(NULL, errorCode)) != NULL) {
896             if(strncmp(type, "private-", 8) == 0) {
897                 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
898                         localeID, type);
899             }
900             Locale localeWithType(locale);
901             localeWithType.setKeywordValue("collation", type, errorCode);
902             errorCode.assertSuccess();
903             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
904             if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
905                                               localeWithType.getName())) {
906                 continue;
907             }
908             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
909             if(uhash_geti(prevLocales, actual.getName()) != 0) {
910                 continue;
911             }
912             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
913             errorCode.assertSuccess();
914             logln("TestTailoredElements(): requested %s -> actual %s",
915                   localeWithType.getName(), actual.getName());
916             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
917             if(rbc == NULL) {
918                 continue;
919             }
920             // Note: It would be better to get tailored strings such that we can
921             // identify the prefix, and only get the CEs for the prefix+string,
922             // not also for the prefix.
923             // There is currently no API for that.
924             // It would help in an unusual case where a contraction starting in the prefix
925             // extends past its end, and we do not see the intended mapping.
926             // For example, for a mapping p|st, if there is also a contraction ps,
927             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
928             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
929             errorCode.assertSuccess();
930             UnicodeSetIterator iter(*tailored);
931             while(iter.next()) {
932                 const UnicodeString &s = iter.getString();
933                 ces.removeAllElements();
934                 rbc->internalGetCEs(s, ces, errorCode);
935                 errorCode.assertSuccess();
936                 for(int32_t i = 0; i < ces.size(); ++i) {
937                     int64_t ce = ces.elementAti(i);
938                     if(!isValidCE(rootElements, *root, ce)) {
939                         errln("invalid tailored CE %016llx at CE index %d from string:",
940                               (long long)ce, (int)i);
941                         infoln(prettify(s));
942                     }
943                 }
944             }
945         }
946     } while((localeID = locales->next(NULL, errorCode)) != NULL);
947     uhash_close(prevLocales);
948 }
949 
printSortKey(const uint8_t * p,int32_t length)950 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
951     UnicodeString s;
952     for(int32_t i = 0; i < length; ++i) {
953         if(i > 0) { s.append((UChar)0x20); }
954         uint8_t b = p[i];
955         if(b == 0) {
956             s.append((UChar)0x2e);  // period
957         } else if(b == 1) {
958             s.append((UChar)0x7c);  // vertical bar
959         } else {
960             appendHex(b, 2, s);
961         }
962     }
963     return s;
964 }
965 
printCollationKey(const CollationKey & key)966 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
967     int32_t length;
968     const uint8_t *p = key.getByteArray(length);
969     return printSortKey(p, length);
970 }
971 
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)972 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
973     for(;;) {
974         int32_t lineLength;
975         const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
976         if(line == NULL || errorCode.isFailure()) {
977             fileLine.remove();
978             return false;
979         }
980         ++fileLineNumber;
981         // Strip trailing CR/LF, comments, and spaces.
982         const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
983         if(comment != NULL) {
984             lineLength = (int32_t)(comment - line);
985         } else {
986             while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
987         }
988         while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
989         if(lineLength != 0) {
990             fileLine.setTo(false, line, lineLength);
991             return true;
992         }
993         // Empty line, continue.
994     }
995 }
996 
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)997 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
998                                 UErrorCode &errorCode) {
999     int32_t length = fileLine.length();
1000     int32_t i;
1001     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1002     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
1003     if(pipeIndex >= 0) {
1004         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1005         if(prefix.isEmpty()) {
1006             errln("empty prefix on line %d", (int)fileLineNumber);
1007             infoln(fileLine);
1008             errorCode = U_PARSE_ERROR;
1009             return;
1010         }
1011         start = pipeIndex + 1;
1012     } else {
1013         prefix.remove();
1014     }
1015     s = fileLine.tempSubStringBetween(start, i).unescape();
1016     if(s.isEmpty()) {
1017         errln("empty string on line %d", (int)fileLineNumber);
1018         infoln(fileLine);
1019         errorCode = U_PARSE_ERROR;
1020         return;
1021     }
1022     start = i;
1023 }
1024 
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1025 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1026     Collation::Level relation;
1027     int32_t start;
1028     if(fileLine[0] == 0x3c) {  // <
1029         UChar second = fileLine[1];
1030         start = 2;
1031         switch(second) {
1032         case 0x31:  // <1
1033             relation = Collation::PRIMARY_LEVEL;
1034             break;
1035         case 0x32:  // <2
1036             relation = Collation::SECONDARY_LEVEL;
1037             break;
1038         case 0x33:  // <3
1039             relation = Collation::TERTIARY_LEVEL;
1040             break;
1041         case 0x34:  // <4
1042             relation = Collation::QUATERNARY_LEVEL;
1043             break;
1044         case 0x63:  // <c
1045             relation = Collation::CASE_LEVEL;
1046             break;
1047         case 0x69:  // <i
1048             relation = Collation::IDENTICAL_LEVEL;
1049             break;
1050         default:  // just <
1051             relation = Collation::NO_LEVEL;
1052             start = 1;
1053             break;
1054         }
1055     } else if(fileLine[0] == 0x3d) {  // =
1056         relation = Collation::ZERO_LEVEL;
1057         start = 1;
1058     } else {
1059         start = 0;
1060     }
1061     if(start == 0 || !isSpace(fileLine[start])) {
1062         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1063         infoln(fileLine);
1064         errorCode.set(U_PARSE_ERROR);
1065         return Collation::NO_LEVEL;
1066     }
1067     start = skipSpaces(start);
1068     UnicodeString prefix;
1069     parseString(start, prefix, s, errorCode);
1070     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1071         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1072         infoln(fileLine);
1073         errorCode.set(U_PARSE_ERROR);
1074         return Collation::NO_LEVEL;
1075     }
1076     if(start < fileLine.length()) {
1077         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1078         infoln(fileLine);
1079         errorCode.set(U_PARSE_ERROR);
1080         return Collation::NO_LEVEL;
1081     }
1082     return relation;
1083 }
1084 
1085 static const struct {
1086     const char *name;
1087     UColAttribute attr;
1088 } attributes[] = {
1089     { "backwards", UCOL_FRENCH_COLLATION },
1090     { "alternate", UCOL_ALTERNATE_HANDLING },
1091     { "caseFirst", UCOL_CASE_FIRST },
1092     { "caseLevel", UCOL_CASE_LEVEL },
1093     // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1094     { "strength", UCOL_STRENGTH },
1095     // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1096     { "numeric", UCOL_NUMERIC_COLLATION }
1097 };
1098 
1099 static const struct {
1100     const char *name;
1101     UColAttributeValue value;
1102 } attributeValues[] = {
1103     { "default", UCOL_DEFAULT },
1104     { "primary", UCOL_PRIMARY },
1105     { "secondary", UCOL_SECONDARY },
1106     { "tertiary", UCOL_TERTIARY },
1107     { "quaternary", UCOL_QUATERNARY },
1108     { "identical", UCOL_IDENTICAL },
1109     { "off", UCOL_OFF },
1110     { "on", UCOL_ON },
1111     { "shifted", UCOL_SHIFTED },
1112     { "non-ignorable", UCOL_NON_IGNORABLE },
1113     { "lower", UCOL_LOWER_FIRST },
1114     { "upper", UCOL_UPPER_FIRST }
1115 };
1116 
parseAndSetAttribute(IcuTestErrorCode & errorCode)1117 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1118     // Parse attributes even if the Collator could not be created,
1119     // in order to report syntax errors.
1120     int32_t start = skipSpaces(1);
1121     int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1122     if(equalPos < 0) {
1123         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1124             parseAndSetReorderCodes(start + 7, errorCode);
1125             return;
1126         }
1127         errln("missing '=' on line %d", (int)fileLineNumber);
1128         infoln(fileLine);
1129         errorCode.set(U_PARSE_ERROR);
1130         return;
1131     }
1132 
1133     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1134     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1135     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1136         UColReorderCode max;
1137         if(valueString == UNICODE_STRING("space", 5)) {
1138             max = UCOL_REORDER_CODE_SPACE;
1139         } else if(valueString == UNICODE_STRING("punct", 5)) {
1140             max = UCOL_REORDER_CODE_PUNCTUATION;
1141         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1142             max = UCOL_REORDER_CODE_SYMBOL;
1143         } else if(valueString == UNICODE_STRING("currency", 8)) {
1144             max = UCOL_REORDER_CODE_CURRENCY;
1145         } else {
1146             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1147             infoln(fileLine);
1148             errorCode.set(U_PARSE_ERROR);
1149             return;
1150         }
1151         if(coll != NULL) {
1152             coll->setMaxVariable(max, errorCode);
1153             if(errorCode.isFailure()) {
1154                 errln("setMaxVariable() failed on line %d: %s",
1155                       (int)fileLineNumber, errorCode.errorName());
1156                 infoln(fileLine);
1157                 return;
1158             }
1159         }
1160         fileLine.remove();
1161         return;
1162     }
1163 
1164     UColAttribute attr;
1165     for(int32_t i = 0;; ++i) {
1166         if(i == UPRV_LENGTHOF(attributes)) {
1167             errln("invalid attribute name on line %d", (int)fileLineNumber);
1168             infoln(fileLine);
1169             errorCode.set(U_PARSE_ERROR);
1170             return;
1171         }
1172         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1173             attr = attributes[i].attr;
1174             break;
1175         }
1176     }
1177 
1178     UColAttributeValue value;
1179     for(int32_t i = 0;; ++i) {
1180         if(i == UPRV_LENGTHOF(attributeValues)) {
1181             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1182             infoln(fileLine);
1183             errorCode.set(U_PARSE_ERROR);
1184             return;
1185         }
1186         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1187             value = attributeValues[i].value;
1188             break;
1189         }
1190     }
1191 
1192     if(coll != NULL) {
1193         coll->setAttribute(attr, value, errorCode);
1194         if(errorCode.isFailure()) {
1195             errln("illegal attribute=value combination on line %d: %s",
1196                   (int)fileLineNumber, errorCode.errorName());
1197             infoln(fileLine);
1198             return;
1199         }
1200     }
1201     fileLine.remove();
1202 }
1203 
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1204 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1205     UVector32 reorderCodes(errorCode);
1206     while(start < fileLine.length()) {
1207         start = skipSpaces(start);
1208         int32_t limit = start;
1209         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1210         CharString name;
1211         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1212         int32_t code = CollationRuleParser::getReorderCode(name.data());
1213         if(code < 0) {
1214             if(uprv_stricmp(name.data(), "default") == 0) {
1215                 code = UCOL_REORDER_CODE_DEFAULT;  // -1
1216             } else {
1217                 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1218                 infoln(fileLine);
1219                 errorCode.set(U_PARSE_ERROR);
1220                 return;
1221             }
1222         }
1223         reorderCodes.addElement(code, errorCode);
1224         start = limit;
1225     }
1226     if(coll != NULL) {
1227         coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1228         if(errorCode.isFailure()) {
1229             errln("setReorderCodes() failed on line %d: %s",
1230                   (int)fileLineNumber, errorCode.errorName());
1231             infoln(fileLine);
1232             return;
1233         }
1234     }
1235     fileLine.remove();
1236 }
1237 
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1238 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1239     UnicodeString rules;
1240     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1241         rules.append(fileLine.unescape());
1242     }
1243     if(errorCode.isFailure()) { return; }
1244     logln(rules);
1245 
1246     UParseError parseError;
1247     UnicodeString reason;
1248     delete coll;
1249     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1250     if(coll == NULL) {
1251         errln("unable to allocate a new collator");
1252         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1253         return;
1254     }
1255     if(errorCode.isFailure()) {
1256         dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1257         infoln(UnicodeString("  reason: ") + reason);
1258         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1259         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1260             infoln(UnicodeString("  snippet: ...") +
1261                 parseError.preContext + "(!)" + parseError.postContext + "...");
1262         }
1263         delete coll;
1264         coll = NULL;
1265         errorCode.reset();
1266     } else {
1267         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1268                      UnicodeString(), reason);
1269     }
1270 }
1271 
setRootCollator(IcuTestErrorCode & errorCode)1272 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1273     if(errorCode.isFailure()) { return; }
1274     delete coll;
1275     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1276     if(errorCode.isFailure()) {
1277         dataerrln("unable to create a root collator");
1278         return;
1279     }
1280 }
1281 
setLocaleCollator(IcuTestErrorCode & errorCode)1282 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1283     if(errorCode.isFailure()) { return; }
1284     delete coll;
1285     coll = NULL;
1286     int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1287     if(at >= 0) {
1288         fileLine.setCharAt(at, (UChar)0x2a);  // *
1289     }
1290     CharString localeID;
1291     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1292     if(at >= 0) {
1293         localeID.data()[at - 9] = '@';
1294     }
1295     Locale locale(localeID.data());
1296     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1297         errln("invalid language tag on line %d", (int)fileLineNumber);
1298         infoln(fileLine);
1299         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1300         return;
1301     }
1302 
1303     logln("creating a collator for locale ID %s", locale.getName());
1304     coll = Collator::createInstance(locale, errorCode);
1305     if(errorCode.isFailure()) {
1306         dataerrln("unable to create a collator for locale %s on line %d",
1307                   locale.getName(), (int)fileLineNumber);
1308         infoln(fileLine);
1309         delete coll;
1310         coll = NULL;
1311         errorCode.reset();
1312     }
1313 }
1314 
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1315 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1316     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return true; }
1317     // In some sequences with Tibetan composite vowel signs,
1318     // even if the string passes the FCD check,
1319     // those composites must be decomposed.
1320     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1321     int32_t index = 0;
1322     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1323         if(++index < s.length()) {
1324             UChar c = s[index];
1325             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return true; }
1326         }
1327     }
1328     return false;
1329 }
1330 
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1331 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1332                                      CharString &dest, int32_t partSize,
1333                                      IcuTestErrorCode &errorCode) {
1334     if(errorCode.isFailure()) { return false; }
1335     uint8_t part[32];
1336     U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1337     UCharIterator iter;
1338     uiter_setString(&iter, s, length);
1339     uint32_t state[2] = { 0, 0 };
1340     for(;;) {
1341         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1342         UBool done = partLength < partSize;
1343         if(done) {
1344             // At the end, append the next byte as well which should be 00.
1345             ++partLength;
1346         }
1347         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1348         if(done) {
1349             return errorCode.isSuccess();
1350         }
1351     }
1352 }
1353 
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1354 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1355                                      const UChar *s, int32_t length,
1356                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1357     if(errorCode.isFailure()) { return false; }
1358     coll->getCollationKey(s, length, key, errorCode);
1359     if(errorCode.isFailure()) {
1360         infoln(fileTestName);
1361         errln("Collator(%s).getCollationKey() failed: %s",
1362               norm, errorCode.errorName());
1363         infoln(line);
1364         return false;
1365     }
1366     int32_t keyLength;
1367     const uint8_t *keyBytes = key.getByteArray(keyLength);
1368     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1369         infoln(fileTestName);
1370         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1371               norm);
1372         infoln(line);
1373         infoln(printCollationKey(key));
1374         return false;
1375     }
1376 
1377     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1378     if(numLevels < UCOL_IDENTICAL) {
1379         ++numLevels;
1380     } else {
1381         numLevels = 5;
1382     }
1383     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1384         ++numLevels;
1385     }
1386     errorCode.assertSuccess();
1387     int32_t numLevelSeparators = 0;
1388     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1389         uint8_t b = keyBytes[i];
1390         if(b == 0) {
1391             infoln(fileTestName);
1392             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1393             infoln(line);
1394             infoln(printCollationKey(key));
1395             return false;
1396         }
1397         if(b == 1) { ++numLevelSeparators; }
1398     }
1399     if(numLevelSeparators != (numLevels - 1)) {
1400         infoln(fileTestName);
1401         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1402               norm, (int)numLevelSeparators, (int)numLevels);
1403         infoln(line);
1404         infoln(printCollationKey(key));
1405         return false;
1406     }
1407 
1408     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1409     static const int32_t partSizes[] = { 32, 3, 1 };
1410     for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1411         int32_t partSize = partSizes[psi];
1412         CharString parts;
1413         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1414             infoln(fileTestName);
1415             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1416                   norm, (int)partSize, errorCode.errorName());
1417             infoln(line);
1418             return false;
1419         }
1420         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1421             infoln(fileTestName);
1422             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1423                   norm, (int)partSize);
1424             infoln(line);
1425             infoln(printCollationKey(key));
1426             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1427             return false;
1428         }
1429     }
1430     return true;
1431 }
1432 
1433 /**
1434  * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1435  * Leaves key unchanged if s does not contain U+FFFE.
1436  * @return true if the key was successfully changed
1437  */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1438 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1439                                            CollationKey &key, IcuTestErrorCode &errorCode) {
1440     if(errorCode.isFailure()) { return false; }
1441     LocalMemory<uint8_t> mergedKey;
1442     int32_t mergedKeyLength = 0;
1443     int32_t mergedKeyCapacity = 0;
1444     int32_t sLength = (length >= 0) ? length : u_strlen(s);
1445     int32_t segmentStart = 0;
1446     for(int32_t i = 0;;) {
1447         if(i == sLength) {
1448             if(segmentStart == 0) {
1449                 // s does not contain any U+FFFE.
1450                 return false;
1451             }
1452         } else if(s[i] != 0xfffe) {
1453             ++i;
1454             continue;
1455         }
1456         // Get the sort key for another segment and merge it into mergedKey.
1457         CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1458         CollationKey key2;
1459         coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1460         int32_t key1Length, key2Length;
1461         const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1462         const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1463         uint8_t *dest;
1464         int32_t minCapacity = key1Length + key2Length;
1465         if(key1Length > 0) { --minCapacity; }
1466         if(minCapacity <= mergedKeyCapacity) {
1467             dest = mergedKey.getAlias();
1468         } else {
1469             if(minCapacity <= 200) {
1470                 mergedKeyCapacity = 200;
1471             } else if(minCapacity <= 2 * mergedKeyCapacity) {
1472                 mergedKeyCapacity *= 2;
1473             } else {
1474                 mergedKeyCapacity = minCapacity;
1475             }
1476             dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1477         }
1478         U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1479         if(key1Length == 0) {
1480             // key2 is the sort key for the first segment.
1481             uprv_memcpy(dest, key2Bytes, key2Length);
1482             mergedKeyLength = key2Length;
1483         } else {
1484             mergedKeyLength =
1485                 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1486                                    dest, mergedKeyCapacity);
1487         }
1488         if(i == sLength) { break; }
1489         segmentStart = ++i;
1490     }
1491     key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1492     return true;
1493 }
1494 
1495 namespace {
1496 
1497 /**
1498  * Replaces unpaired surrogates with U+FFFD.
1499  * Returns s if no replacement was made, otherwise buffer.
1500  */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1501 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1502     int32_t i = 0;
1503     while(i < s.length()) {
1504         UChar32 c = s.char32At(i);
1505         if(U_IS_SURROGATE(c)) {
1506             if(buffer.length() < i) {
1507                 buffer.append(s, buffer.length(), i - buffer.length());
1508             }
1509             buffer.append((UChar)0xfffd);
1510         }
1511         i += U16_LENGTH(c);
1512     }
1513     if(buffer.isEmpty()) {
1514         return s;
1515     }
1516     if(buffer.length() < i) {
1517         buffer.append(s, buffer.length(), i - buffer.length());
1518     }
1519     return buffer;
1520 }
1521 
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1522 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1523                            UCollationResult order, UBool collHasCaseLevel) {
1524     if(order == UCOL_EQUAL) {
1525         return Collation::NO_LEVEL;
1526     }
1527     int32_t prevKeyLength;
1528     const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1529     int32_t keyLength;
1530     const uint8_t *bytes = key.getByteArray(keyLength);
1531     int32_t level = Collation::PRIMARY_LEVEL;
1532     for(int32_t i = 0;; ++i) {
1533         uint8_t b = prevBytes[i];
1534         if(b != bytes[i]) { break; }
1535         if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1536             ++level;
1537             if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1538                 ++level;
1539             }
1540         }
1541     }
1542     return level;
1543 }
1544 
1545 }
1546 
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1547 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1548                                      const UnicodeString &prevString, const UnicodeString &s,
1549                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1550                                      IcuTestErrorCode &errorCode) {
1551     if(errorCode.isFailure()) { return false; }
1552 
1553     // Get the sort keys first, for error debug output.
1554     CollationKey prevKey;
1555     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1556                         prevKey, errorCode)) {
1557         return false;
1558     }
1559     CollationKey key;
1560     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return false; }
1561 
1562     UCollationResult order = coll->compare(prevString, s, errorCode);
1563     if(order != expectedOrder || errorCode.isFailure()) {
1564         infoln(fileTestName);
1565         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1566               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1567         infoln(prevFileLine);
1568         infoln(fileLine);
1569         infoln(printCollationKey(prevKey));
1570         infoln(printCollationKey(key));
1571         return false;
1572     }
1573     order = coll->compare(s, prevString, errorCode);
1574     if(order != -expectedOrder || errorCode.isFailure()) {
1575         infoln(fileTestName);
1576         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1577               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1578         infoln(prevFileLine);
1579         infoln(fileLine);
1580         infoln(printCollationKey(prevKey));
1581         infoln(printCollationKey(key));
1582         return false;
1583     }
1584     // Test NUL-termination if the strings do not contain NUL characters.
1585     UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1586     if(!containNUL) {
1587         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1588         if(order != expectedOrder || errorCode.isFailure()) {
1589             infoln(fileTestName);
1590             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1591                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1592             infoln(prevFileLine);
1593             infoln(fileLine);
1594             infoln(printCollationKey(prevKey));
1595             infoln(printCollationKey(key));
1596             return false;
1597         }
1598         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1599         if(order != -expectedOrder || errorCode.isFailure()) {
1600             infoln(fileTestName);
1601             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1602                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1603             infoln(prevFileLine);
1604             infoln(fileLine);
1605             infoln(printCollationKey(prevKey));
1606             infoln(printCollationKey(key));
1607             return false;
1608         }
1609     }
1610 
1611     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1612     // Unpaired surrogates cannot be converted to UTF-8.
1613     // Create valid UTF-16 strings if necessary, and use those for
1614     // both the expected compare() result and for the input to compare(UTF-8).
1615     UnicodeString prevBuffer, sBuffer;
1616     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1617     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1618     std::string prevUTF8, sUTF8;
1619     UnicodeString(prevValid).toUTF8String(prevUTF8);
1620     UnicodeString(sValid).toUTF8String(sUTF8);
1621     UCollationResult expectedUTF8Order;
1622     if(&prevValid == &prevString && &sValid == &s) {
1623         expectedUTF8Order = expectedOrder;
1624     } else {
1625         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1626     }
1627 
1628     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1629     if(order != expectedUTF8Order || errorCode.isFailure()) {
1630         infoln(fileTestName);
1631         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1632               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1633         infoln(prevFileLine);
1634         infoln(fileLine);
1635         infoln(printCollationKey(prevKey));
1636         infoln(printCollationKey(key));
1637         return false;
1638     }
1639     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1640     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1641         infoln(fileTestName);
1642         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1643               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1644         infoln(prevFileLine);
1645         infoln(fileLine);
1646         infoln(printCollationKey(prevKey));
1647         infoln(printCollationKey(key));
1648         return false;
1649     }
1650     // Test NUL-termination if the strings do not contain NUL characters.
1651     if(!containNUL) {
1652         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1653         if(order != expectedUTF8Order || errorCode.isFailure()) {
1654             infoln(fileTestName);
1655             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1656                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1657             infoln(prevFileLine);
1658             infoln(fileLine);
1659             infoln(printCollationKey(prevKey));
1660             infoln(printCollationKey(key));
1661             return false;
1662         }
1663         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1664         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1665             infoln(fileTestName);
1666             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1667                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1668             infoln(prevFileLine);
1669             infoln(fileLine);
1670             infoln(printCollationKey(prevKey));
1671             infoln(printCollationKey(key));
1672             return false;
1673         }
1674     }
1675 
1676     UCharIterator leftIter;
1677     UCharIterator rightIter;
1678     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1679     uiter_setString(&rightIter, s.getBuffer(), s.length());
1680     order = coll->compare(leftIter, rightIter, errorCode);
1681     if(order != expectedOrder || errorCode.isFailure()) {
1682         infoln(fileTestName);
1683         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1684               "wrong order: %d != %d (%s)",
1685               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1686         infoln(prevFileLine);
1687         infoln(fileLine);
1688         infoln(printCollationKey(prevKey));
1689         infoln(printCollationKey(key));
1690         return false;
1691     }
1692 
1693     order = prevKey.compareTo(key, errorCode);
1694     if(order != expectedOrder || errorCode.isFailure()) {
1695         infoln(fileTestName);
1696         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1697               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1698         infoln(prevFileLine);
1699         infoln(fileLine);
1700         infoln(printCollationKey(prevKey));
1701         infoln(printCollationKey(key));
1702         return false;
1703     }
1704     UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1705     int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1706     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1707         if(level != expectedLevel) {
1708             infoln(fileTestName);
1709             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1710                   (int)fileLineNumber, norm, order, level, expectedLevel);
1711             infoln(prevFileLine);
1712             infoln(fileLine);
1713             infoln(printCollationKey(prevKey));
1714             infoln(printCollationKey(key));
1715             return false;
1716         }
1717     }
1718 
1719     // If either string contains U+FFFE, then their sort keys must compare the same as
1720     // the merged sort keys of each string's between-FFFE segments.
1721     //
1722     // It is not required that
1723     //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1724     // only that those two methods yield the same order.
1725     //
1726     // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1727     if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1728                 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1729             errorCode.isFailure()) {
1730         order = prevKey.compareTo(key, errorCode);
1731         if(order != expectedOrder || errorCode.isFailure()) {
1732             infoln(fileTestName);
1733             errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1734                 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1735                 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1736             infoln(prevFileLine);
1737             infoln(fileLine);
1738             infoln(printCollationKey(prevKey));
1739             infoln(printCollationKey(key));
1740             return false;
1741         }
1742         int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1743         if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1744             if(mergedLevel != level) {
1745                 infoln(fileTestName);
1746                 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1747                     "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1748                     (int)fileLineNumber, norm, order, mergedLevel, level);
1749                 infoln(prevFileLine);
1750                 infoln(fileLine);
1751                 infoln(printCollationKey(prevKey));
1752                 infoln(printCollationKey(key));
1753                 return false;
1754             }
1755         }
1756     }
1757     return true;
1758 }
1759 
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1760 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1761     if(errorCode.isFailure()) { return; }
1762     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1763     UnicodeString prevString, s;
1764     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1765     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1766         // Parse the line even if it will be ignored (when we do not have a Collator)
1767         // in order to report syntax issues.
1768         Collation::Level relation = parseRelationAndString(s, errorCode);
1769         if(errorCode.isFailure()) {
1770             errorCode.reset();
1771             break;
1772         }
1773         if(coll == NULL) {
1774             // We were unable to create the Collator but continue with tests.
1775             // Ignore test data for this Collator.
1776             // The next Collator creation might work.
1777             continue;
1778         }
1779         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1780         Collation::Level expectedLevel = relation;
1781         s.getTerminatedBuffer();  // Ensure NUL-termination.
1782         UBool isOk = true;
1783         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1784             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1785             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1786                                    expectedOrder, expectedLevel, errorCode);
1787         }
1788         if(isOk) {
1789             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1790             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1791                                    expectedOrder, expectedLevel, errorCode);
1792         }
1793         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1794             UnicodeString pn = nfd->normalize(prevString, errorCode);
1795             UnicodeString n = nfd->normalize(s, errorCode);
1796             pn.getTerminatedBuffer();
1797             n.getTerminatedBuffer();
1798             errorCode.assertSuccess();
1799             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1800                                    expectedOrder, expectedLevel, errorCode);
1801         }
1802         if(!isOk) {
1803             errorCode.reset();  // already reported
1804         }
1805         prevFileLine = fileLine;
1806         prevString = s;
1807         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1808     }
1809 }
1810 
TestDataDriven()1811 void CollationTest::TestDataDriven() {
1812     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1813 
1814     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1815     nfd = Normalizer2::getNFDInstance(errorCode);
1816     if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1817         return;
1818     }
1819 
1820     CharString path(getSourceTestData(errorCode), errorCode);
1821     path.appendPathPart("collationtest.txt", errorCode);
1822     const char *codePage = "UTF-8";
1823     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, true, false, errorCode));
1824     if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1825         return;
1826     }
1827     // Read a new line if necessary.
1828     // Sub-parsers leave the first line set that they do not handle.
1829     while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1830         if(!isSectionStarter(fileLine[0])) {
1831             errln("syntax error on line %d", (int)fileLineNumber);
1832             infoln(fileLine);
1833             return;
1834         }
1835         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1836             fileTestName = fileLine;
1837             logln(fileLine);
1838             fileLine.remove();
1839         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1840             setRootCollator(errorCode);
1841             fileLine.remove();
1842         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1843             setLocaleCollator(errorCode);
1844             fileLine.remove();
1845         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1846             buildTailoring(f.getAlias(), errorCode);
1847         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1848             parseAndSetAttribute(errorCode);
1849         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1850             checkCompareStrings(f.getAlias(), errorCode);
1851         } else {
1852             errln("syntax error on line %d", (int)fileLineNumber);
1853             infoln(fileLine);
1854             return;
1855         }
1856     }
1857 }
1858 
TestLongLocale()1859 void CollationTest::TestLongLocale() {
1860     IcuTestErrorCode errorCode(*this, "TestLongLocale");
1861     Locale longLocale("sie__1G_C_CEIE_CEZCX_CSUE_E_EIESZNI2_GB_LM_LMCSUE_LMCSX_"
1862                       "LVARIANT_MMCSIE_STEU_SU1GCEIE_SU6G_SU6SU6G_U_UBGE_UC_"
1863                       "UCEZCSI_UCIE_UZSIU_VARIANT_X@collation=bcs-ukvsz");
1864     LocalPointer<Collator> coll(Collator::createInstance(longLocale, errorCode));
1865 }
1866 
TestBuilderContextsOverflow()1867 void CollationTest::TestBuilderContextsOverflow() {
1868     IcuTestErrorCode errorCode(*this, "TestBuilderContextsOverflow");
1869     // ICU-20715: Bad memory access in what looks like a bogus CharsTrie after
1870     // intermediate contextual-mappings data overflowed.
1871     // Caused by the CollationDataBuilder using some outdated values when building
1872     // contextual mappings with both prefix and contraction matching.
1873     // Fixed by resetting those outdated values before code looks at them.
1874     char16_t rules[] = {
1875         u'&', 0x10, 0x2ff, 0x503c, 0x4617,
1876         u'=', 0x80, 0x4f7f, 0xff, 0x3c3d, 0x1c4f, 0x3c3c,
1877         u'<', 0, 0, 0, 0, u'|', 0, 0, 0, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f, 0xff,
1878         u'=', 0, u'|', 0, 0, 0, 0, 0, 0, 0x1f00, 0xe30,
1879         0x3035, 0, 0, 0xd200, 0, 0x7f00, 0xff4f, 0x3d00, 0, 0x7c00,
1880         0, 0, 0, 0, 0, 0, 0, 0x301f, 0x350e, 0x30,
1881         0, 0, 0xd2, 0x7c00, 0, 0, 0, 0, 0, 0,
1882         0, 0x301f, 0x350e, 0x30, 0, 0, 0x52d2, 0x2f3c, 0x5552, 0x493c,
1883         0x1f10, 0x1f50, 0x300, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f,
1884         0xff,
1885         u'=', 0, u'|', 0, 0, 0, 0, 0x5000, 0x4617,
1886         u'=', 0x80, 0x4f7f, 0, 0, 0xd200, 0
1887     };
1888     UnicodeString s(false, rules, UPRV_LENGTHOF(rules));
1889     LocalPointer<Collator> coll(new RuleBasedCollator(s, errorCode), errorCode);
1890     if(errorCode.isSuccess()) {
1891         logln("successfully built the Collator");
1892     }
1893 }
1894 
1895 #endif  // !UCONFIG_NO_COLLATION
1896