• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/stringpiece.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uniset.h"
29 #include "unicode/unistr.h"
30 #include "unicode/usetiter.h"
31 #include "unicode/ustring.h"
32 #include "charstr.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationfcd.h"
37 #include "collationiterator.h"
38 #include "collationroot.h"
39 #include "collationrootelements.h"
40 #include "collationruleparser.h"
41 #include "collationweights.h"
42 #include "cstring.h"
43 #include "intltest.h"
44 #include "normalizer2impl.h"
45 #include "ucbuf.h"
46 #include "uhash.h"
47 #include "uitercollationiterator.h"
48 #include "utf16collationiterator.h"
49 #include "utf8collationiterator.h"
50 #include "uvectr32.h"
51 #include "uvectr64.h"
52 #include "writesrc.h"
53 
54 class CodePointIterator;
55 
56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57 
58 class CollationTest : public IntlTest {
59 public:
CollationTest()60     CollationTest()
61             : fcd(NULL), nfd(NULL),
62               fileLineNumber(0),
63               coll(NULL) {}
64 
~CollationTest()65     ~CollationTest() {
66         delete coll;
67     }
68 
69     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
70 
71     void TestMinMax();
72     void TestImplicits();
73     void TestNulTerminated();
74     void TestIllegalUTF8();
75     void TestShortFCDData();
76     void TestFCD();
77     void TestCollationWeights();
78     void TestRootElements();
79     void TestTailoredElements();
80     void TestDataDriven();
81     void TestLongLocale();
82 
83 private:
84     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
85     void checkAllocWeights(CollationWeights &cw,
86                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
87                            int32_t someLength, int32_t minCount);
88 
89     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
90     static UnicodeString printCollationKey(const CollationKey &key);
91 
92     // Helpers & fields for data-driven test.
isCROrLF(UChar c)93     static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)94     static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)95     static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
skipSpaces(int32_t i)96     int32_t skipSpaces(int32_t i) {
97         while(isSpace(fileLine[i])) { ++i; }
98         return i;
99     }
100 
101     UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
102     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
103     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
104     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
105     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
106     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
107     void setRootCollator(IcuTestErrorCode &errorCode);
108     void setLocaleCollator(IcuTestErrorCode &errorCode);
109 
110     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
111 
112     UBool getSortKeyParts(const UChar *s, int32_t length,
113                           CharString &dest, int32_t partSize,
114                           IcuTestErrorCode &errorCode);
115     UBool getCollationKey(const char *norm, const UnicodeString &line,
116                           const UChar *s, int32_t length,
117                           CollationKey &key, IcuTestErrorCode &errorCode);
118     UBool getMergedCollationKey(const UChar *s, int32_t length,
119                                 CollationKey &key, IcuTestErrorCode &errorCode);
120     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
121                           const UnicodeString &prevString, const UnicodeString &s,
122                           UCollationResult expectedOrder, Collation::Level expectedLevel,
123                           IcuTestErrorCode &errorCode);
124     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
125 
126     const Normalizer2 *fcd, *nfd;
127     UnicodeString fileLine;
128     int32_t fileLineNumber;
129     UnicodeString fileTestName;
130     Collator *coll;
131 };
132 
createCollationTest()133 extern IntlTest *createCollationTest() {
134     return new CollationTest();
135 }
136 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)137 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
138     if(exec) {
139         logln("TestSuite CollationTest: ");
140     }
141     TESTCASE_AUTO_BEGIN;
142     TESTCASE_AUTO(TestMinMax);
143     TESTCASE_AUTO(TestImplicits);
144     TESTCASE_AUTO(TestNulTerminated);
145     TESTCASE_AUTO(TestIllegalUTF8);
146     TESTCASE_AUTO(TestShortFCDData);
147     TESTCASE_AUTO(TestFCD);
148     TESTCASE_AUTO(TestCollationWeights);
149     TESTCASE_AUTO(TestRootElements);
150     TESTCASE_AUTO(TestTailoredElements);
151     TESTCASE_AUTO(TestDataDriven);
152     TESTCASE_AUTO(TestLongLocale);
153     TESTCASE_AUTO_END;
154 }
155 
TestMinMax()156 void CollationTest::TestMinMax() {
157     IcuTestErrorCode errorCode(*this, "TestMinMax");
158 
159     setRootCollator(errorCode);
160     if(errorCode.isFailure()) {
161         errorCode.reset();
162         return;
163     }
164     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
165     if(rbc == NULL) {
166         errln("the root collator is not a RuleBasedCollator");
167         return;
168     }
169 
170     static const UChar s[2] = { 0xfffe, 0xffff };
171     UVector64 ces(errorCode);
172     rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
173     errorCode.assertSuccess();
174     if(ces.size() != 2) {
175         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
176         return;
177     }
178     int64_t ce = ces.elementAti(0);
179     int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
180     if(ce != expected) {
181         errln("CE(U+fffe)=%04lx != 02..", (long)ce);
182     }
183 
184     ce = ces.elementAti(1);
185     expected = Collation::makeCE(Collation::MAX_PRIMARY);
186     if(ce != expected) {
187         errln("CE(U+ffff)=%04lx != max..", (long)ce);
188     }
189 }
190 
TestImplicits()191 void CollationTest::TestImplicits() {
192     IcuTestErrorCode errorCode(*this, "TestImplicits");
193 
194     const CollationData *cd = CollationRoot::getData(errorCode);
195     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
196         return;
197     }
198 
199     // Implicit primary weights should be assigned for the following sets,
200     // and sort in ascending order by set and then code point.
201     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
202 
203     // core Han Unified Ideographs
204     UnicodeSet coreHan("[\\p{unified_ideograph}&"
205                             "[\\p{Block=CJK_Unified_Ideographs}"
206                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
207                        errorCode);
208     // all other Unified Han ideographs
209     UnicodeSet otherHan("[\\p{unified ideograph}-"
210                             "[\\p{Block=CJK_Unified_Ideographs}"
211                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
212                         errorCode);
213     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
214     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
215 
216     // Starting with CLDR 26/ICU 54, the root Han order may instead be
217     // the Unihan radical-stroke order.
218     // The tests should pass either way, so we only test the order of a small set of Han characters
219     // whose radical-stroke order is the same as their code point order.
220     UnicodeSet someHanInCPOrder(
221             "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
222             "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
223             errorCode);
224     UnicodeSet inOrder(someHanInCPOrder);
225     inOrder.addAll(unassigned).freeze();
226     if(errorCode.errIfFailureAndReset("UnicodeSet")) {
227         return;
228     }
229     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
230     UChar32 prev = 0;
231     uint32_t prevPrimary = 0;
232     UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
233     for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
234         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
235         while(iter->next()) {
236             UChar32 c = iter->getCodepoint();
237             UnicodeString s(c);
238             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
239             int64_t ce = ci.nextCE(errorCode);
240             int64_t ce2 = ci.nextCE(errorCode);
241             if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
242                 return;
243             }
244             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
245                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
246                 continue;
247             }
248             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
249                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
250                       (long)c, (long)(ce & 0xffffffff));
251                 continue;
252             }
253             uint32_t primary = (uint32_t)(ce >> 32);
254             if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
255                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
256                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
257             }
258             prev = c;
259             prevPrimary = primary;
260         }
261     }
262 }
263 
TestNulTerminated()264 void CollationTest::TestNulTerminated() {
265     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
266     const CollationData *data = CollationRoot::getData(errorCode);
267     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
268         return;
269     }
270 
271     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
272 
273     UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
274     UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
275     for(int32_t i = 0;; ++i) {
276         int64_t ce1 = ci1.nextCE(errorCode);
277         int64_t ce2 = ci2.nextCE(errorCode);
278         if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
279             return;
280         }
281         if(ce1 != ce2) {
282             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
283             break;
284         }
285         if(ce1 == Collation::NO_CE) { break; }
286     }
287 }
288 
TestIllegalUTF8()289 void CollationTest::TestIllegalUTF8() {
290     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
291 
292     setRootCollator(errorCode);
293     if(errorCode.isFailure()) {
294         errorCode.reset();
295         return;
296     }
297     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
298 
299     static const StringPiece strings[] = {
300         // string with U+FFFD == illegal byte sequence
301         u8"a\uFFFDz",                   "a\x80z",  // trail byte
302         u8"a\uFFFD\uFFFDz",             "a\xc1\x81z",  // non-shortest form
303         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xe0\x82\x83z",  // non-shortest form
304         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
305         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
306         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz",  // non-shortest form
307         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
308     };
309 
310     for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
311         StringPiece fffd(strings[i]);
312         StringPiece illegal(strings[i + 1]);
313         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
314         if(order != UCOL_EQUAL) {
315             errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
316                   (int)i, order);
317         }
318     }
319 }
320 
321 namespace {
322 
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)323 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
324     for(UChar32 c = 0x10000; c < 0x110000;) {
325         UChar32 next = c + 0x400;
326         if(src.containsSome(c, next - 1)) {
327             dest.add(U16_LEAD(c));
328         }
329         c = next;
330     }
331 }
332 
333 }  // namespace
334 
TestShortFCDData()335 void CollationTest::TestShortFCDData() {
336     // See CollationFCD class comments.
337     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
338     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
339     errorCode.assertSuccess();
340     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
341     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
342     UnicodeSet lccc;  // actual
343     for(UChar32 c = 0; c <= 0xffff; ++c) {
344         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
345     }
346     UnicodeSet diff(expectedLccc);
347     diff.removeAll(lccc);
348     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
349     UnicodeString empty("[]");
350     UnicodeString diffString;
351     diff.toPattern(diffString, TRUE);
352     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
353     diff = lccc;
354     diff.removeAll(expectedLccc);
355     diff.toPattern(diffString, TRUE);
356     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
357 
358     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
359     if (errorCode.isSuccess()) {
360         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
361         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
362         UnicodeSet tccc;  // actual
363         for(UChar32 c = 0; c <= 0xffff; ++c) {
364             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
365         }
366         diff = expectedTccc;
367         diff.removeAll(tccc);
368         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
369         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
370         diff = tccc;
371         diff.removeAll(expectedTccc);
372         diff.toPattern(diffString, TRUE);
373         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
374     }
375 }
376 
377 class CodePointIterator {
378 public:
CodePointIterator(const UChar32 * cp,int32_t length)379     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()380     void resetToStart() { pos = 0; }
next()381     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()382     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const383     int32_t getLength() const { return length; }
getIndex() const384     int getIndex() const { return (int)pos; }
385 private:
386     const UChar32 *cp;
387     int32_t length;
388     int32_t pos;
389 };
390 
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)391 void CollationTest::checkFCD(const char *name,
392                              CollationIterator &ci, CodePointIterator &cpi) {
393     IcuTestErrorCode errorCode(*this, "checkFCD");
394 
395     // Iterate forward to the limit.
396     for(;;) {
397         UChar32 c1 = ci.nextCodePoint(errorCode);
398         UChar32 c2 = cpi.next();
399         if(c1 != c2) {
400             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
401                   name, (long)c1, (long)c2, cpi.getIndex());
402             return;
403         }
404         if(c1 < 0) { break; }
405     }
406 
407     // Iterate backward most of the way.
408     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
409         UChar32 c1 = ci.previousCodePoint(errorCode);
410         UChar32 c2 = cpi.previous();
411         if(c1 != c2) {
412             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
413                   name, (long)c1, (long)c2, cpi.getIndex());
414             return;
415         }
416     }
417 
418     // Forward again.
419     for(;;) {
420         UChar32 c1 = ci.nextCodePoint(errorCode);
421         UChar32 c2 = cpi.next();
422         if(c1 != c2) {
423             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
424                   name, (long)c1, (long)c2, cpi.getIndex());
425             return;
426         }
427         if(c1 < 0) { break; }
428     }
429 
430     // Iterate backward to the start.
431     for(;;) {
432         UChar32 c1 = ci.previousCodePoint(errorCode);
433         UChar32 c2 = cpi.previous();
434         if(c1 != c2) {
435             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
436                   name, (long)c1, (long)c2, cpi.getIndex());
437             return;
438         }
439         if(c1 < 0) { break; }
440     }
441 }
442 
TestFCD()443 void CollationTest::TestFCD() {
444     IcuTestErrorCode errorCode(*this, "TestFCD");
445     const CollationData *data = CollationRoot::getData(errorCode);
446     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
447         return;
448     }
449 
450     // Input string, not FCD, NUL-terminated.
451     static const UChar s[] = {
452         0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
453         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
454         0x327, 0x308,  // ccc=202, 230
455         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
456         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
457         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
458         0xac01,
459         0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
460         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
461         0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
462         0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
463         0x4e00, 0xf81,
464         0
465     };
466     // Expected code points.
467     static const UChar32 cp[] = {
468         0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
469         0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
470         0x1D15F, 0x1D16D,
471         0xac01,
472         0x63, 0x327, 0x1D165, 0x1D16D,
473         0x61,
474         0xf71, 0xf71, 0xf72, 0xf74, 0x301,
475         0x4e00, 0xf71, 0xf80
476     };
477 
478     FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
479     if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
480         return;
481     }
482     CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
483     checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
484 
485     cpi.resetToStart();
486     std::string utf8;
487     UnicodeString(s).toUTF8String(utf8);
488     FCDUTF8CollationIterator u8ci(data, FALSE,
489                                   reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
490     if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
491         return;
492     }
493     checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
494 
495     cpi.resetToStart();
496     UCharIterator iter;
497     uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
498     FCDUIterCollationIterator uici(data, FALSE, iter, 0);
499     if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
500         return;
501     }
502     checkFCD("FCDUIterCollationIterator", uici, cpi);
503 }
504 
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)505 void CollationTest::checkAllocWeights(CollationWeights &cw,
506                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
507                                       int32_t someLength, int32_t minCount) {
508     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
509         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
510               (long)lowerLimit, (long)upperLimit, (long)n);
511         return;
512     }
513     uint32_t previous = lowerLimit;
514     int32_t count = 0;  // number of weights that have someLength
515     for(int32_t i = 0; i < n; ++i) {
516         uint32_t w = cw.nextWeight();
517         if(w == 0xffffffff) {
518             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
519                   "returns only %ld weights",
520                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
521             return;
522         }
523         if(!(previous < w && w < upperLimit)) {
524             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
525                   "number %ld -> %lx not between %lx and %lx",
526                   (long)lowerLimit, (long)upperLimit, (long)n,
527                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
528             return;
529         }
530         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
531     }
532     if(count < minCount) {
533         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
534               "returns only %ld < %ld weights of length %d",
535               (long)lowerLimit, (long)upperLimit, (long)n,
536               (long)count, (long)minCount, (int)someLength);
537     }
538 }
539 
TestCollationWeights()540 void CollationTest::TestCollationWeights() {
541     CollationWeights cw;
542 
543     // Non-compressible primaries use 254 second bytes 02..FF.
544     logln("CollationWeights.initForPrimary(non-compressible)");
545     cw.initForPrimary(FALSE);
546     // Expect 1 weight 11 and 254 weights 12xx.
547     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
548     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
549     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
550     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
551     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
552     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
553     // Expect 254^2=64516 three-byte weights.
554     // During computation, there should be 3 three-byte ranges
555     // 10ffff, 11xxxx, 120202.
556     // The middle one should be split 64515:1,
557     // and the newly-split-off range and the last ranged lengthened.
558     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
559     // Expect weights 1102 & 1103.
560     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
561     // Expect weights 102102 & 102103.
562     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
563 
564     // Compressible primaries use 251 second bytes 04..FE.
565     logln("CollationWeights.initForPrimary(compressible)");
566     cw.initForPrimary(TRUE);
567     // Expect 1 weight 11 and 251 weights 12xx.
568     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
569     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
570     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
571     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
572     // Expect weights 1104 & 1105.
573     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
574     // Expect weights 102102 & 102103.
575     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
576 
577     // Secondary and tertiary weights use only bytes 3 & 4.
578     logln("CollationWeights.initForSecondary()");
579     cw.initForSecondary();
580     // Expect weights fbxx and all four fc..ff.
581     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
582 
583     logln("CollationWeights.initForTertiary()");
584     cw.initForTertiary();
585     // Expect weights 3dxx and both 3e & 3f.
586     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
587 }
588 
589 namespace {
590 
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)591 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
592                 uint32_t p, uint32_t s, uint32_t ctq) {
593     uint32_t p1 = p >> 24;
594     uint32_t p2 = (p >> 16) & 0xff;
595     uint32_t p3 = (p >> 8) & 0xff;
596     uint32_t p4 = p & 0xff;
597     uint32_t s1 = s >> 8;
598     uint32_t s2 = s & 0xff;
599     // ctq = Case, Tertiary, Quaternary
600     uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
601     uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
602     uint32_t t1 = t >> 8;
603     uint32_t t2 = t & 0xff;
604     uint32_t q = ctq & Collation::QUATERNARY_MASK;
605     // No leading zero bytes.
606     if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
607         return FALSE;
608     }
609     // No intermediate zero bytes.
610     if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
611         return FALSE;
612     }
613     if(p2 != 0 && p3 == 0 && p4 != 0) {
614         return FALSE;
615     }
616     // Minimum & maximum lead bytes.
617     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
618             s1 == Collation::LEVEL_SEPARATOR_BYTE ||
619             t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
620         return FALSE;
621     }
622     if(c > 2) {
623         return FALSE;
624     }
625     // The valid byte range for the second primary byte depends on compressibility.
626     if(p2 != 0) {
627         if(data.isCompressibleLeadByte(p1)) {
628             if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
629                     Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
630                 return FALSE;
631             }
632         } else {
633             if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
634                 return FALSE;
635             }
636         }
637     }
638     // Other bytes just need to avoid the level separator.
639     // Trailing zeros are ok.
640     U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
641     if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
642             s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
643         return FALSE;
644     }
645     // Well-formed CEs.
646     if(p == 0) {
647         if(s == 0) {
648             if(t == 0) {
649                 // Completely ignorable CE.
650                 // Quaternary CEs are not supported.
651                 if(c != 0 || q != 0) {
652                     return FALSE;
653                 }
654             } else {
655                 // Tertiary CE.
656                 if(t < re.getTertiaryBoundary() || c != 2) {
657                     return FALSE;
658                 }
659             }
660         } else {
661             // Secondary CE.
662             if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
663                 return FALSE;
664             }
665         }
666     } else {
667         // Primary CE.
668         if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
669                 s >= re.getSecondaryBoundary()) {
670             return FALSE;
671         }
672         if(t == 0 || t >= re.getTertiaryBoundary()) {
673             return FALSE;
674         }
675     }
676     return TRUE;
677 }
678 
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)679 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
680     uint32_t p = (uint32_t)(ce >> 32);
681     uint32_t secTer = (uint32_t)ce;
682     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
683 }
684 
685 class RootElementsIterator {
686 public:
RootElementsIterator(const CollationData & root)687     RootElementsIterator(const CollationData &root)
688             : data(root),
689               elements(root.rootElements), length(root.rootElementsLength),
690               pri(0), secTer(0),
691               index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
692 
next()693     UBool next() {
694         if(index >= length) { return FALSE; }
695         uint32_t p = elements[index];
696         if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
697         if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
698             ++index;
699             secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
700             return TRUE;
701         }
702         if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
703             // End of a range, enumerate the primaries in the range.
704             int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
705             p &= 0xffffff00;
706             if(pri == p) {
707                 // Finished the range, return the next CE after it.
708                 ++index;
709                 return next();
710             }
711             U_ASSERT(pri < p);
712             // Return the next primary in this range.
713             UBool isCompressible = data.isCompressiblePrimary(pri);
714             if((pri & 0xffff) == 0) {
715                 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
716             } else {
717                 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
718             }
719             return TRUE;
720         }
721         // Simple primary CE.
722         ++index;
723         pri = p;
724         // Does this have an explicit below-common sec/ter unit,
725         // or does it imply a common one?
726         if(index == length) {
727             secTer = Collation::COMMON_SEC_AND_TER_CE;
728         } else {
729             secTer = elements[index];
730             if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
731                 // No sec/ter delta.
732                 secTer = Collation::COMMON_SEC_AND_TER_CE;
733             } else {
734                 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
735                 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
736                     // Implied sec/ter.
737                     secTer = Collation::COMMON_SEC_AND_TER_CE;
738                 } else {
739                     // Explicit sec/ter below common/common.
740                     ++index;
741                 }
742             }
743         }
744         return TRUE;
745     }
746 
getPrimary() const747     uint32_t getPrimary() const { return pri; }
getSecTer() const748     uint32_t getSecTer() const { return secTer; }
749 
750 private:
751     const CollationData &data;
752     const uint32_t *elements;
753     int32_t length;
754 
755     uint32_t pri;
756     uint32_t secTer;
757     int32_t index;
758 };
759 
760 }  // namespace
761 
TestRootElements()762 void CollationTest::TestRootElements() {
763     IcuTestErrorCode errorCode(*this, "TestRootElements");
764     const CollationData *root = CollationRoot::getData(errorCode);
765     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
766         return;
767     }
768     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
769     RootElementsIterator iter(*root);
770 
771     // We check each root CE for validity,
772     // and we also verify that there is a tailoring gap between each two CEs.
773     CollationWeights cw1c;  // compressible primary weights
774     CollationWeights cw1u;  // uncompressible primary weights
775     CollationWeights cw2;
776     CollationWeights cw3;
777 
778     cw1c.initForPrimary(TRUE);
779     cw1u.initForPrimary(FALSE);
780     cw2.initForSecondary();
781     cw3.initForTertiary();
782 
783     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
784     // nor the special merge-separator CE for U+FFFE.
785     uint32_t prevPri = 0;
786     uint32_t prevSec = 0;
787     uint32_t prevTer = 0;
788     while(iter.next()) {
789         uint32_t pri = iter.getPrimary();
790         uint32_t secTer = iter.getSecTer();
791         // CollationRootElements CEs must have 0 case and quaternary bits.
792         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
793             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
794                   (long)pri, (long)secTer);
795         }
796         uint32_t sec = secTer >> 16;
797         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
798         uint32_t ctq = ter;
799         if(pri == 0 && sec == 0 && ter != 0) {
800             // Tertiary CEs must have uppercase bits,
801             // but they are not stored in the CollationRootElements.
802             ctq |= 0x8000;
803         }
804         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
805             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
806         } else {
807             if(pri != prevPri) {
808                 uint32_t newWeight = 0;
809                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
810                     // There is currently no tailoring gap after primary ignorables,
811                     // and we forbid tailoring after U+FFFD and U+FFFF.
812                 } else if(root->isCompressiblePrimary(prevPri)) {
813                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
814                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
815                               (long)prevPri, (long)pri);
816                     } else {
817                         newWeight = cw1c.nextWeight();
818                     }
819                 } else {
820                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
821                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
822                               (long)prevPri, (long)pri);
823                     } else {
824                         newWeight = cw1u.nextWeight();
825                     }
826                 }
827                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
828                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
829                           (long)prevPri, (long)newWeight, (long)pri);
830                 }
831             } else if(sec != prevSec) {
832                 uint32_t lowerLimit =
833                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
834                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
835                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
836                 } else {
837                     uint32_t newWeight = cw2.nextWeight();
838                     if(!(prevSec < newWeight && newWeight < sec)) {
839                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
840                               (long)lowerLimit, (long)newWeight, (long)sec);
841                     }
842                 }
843             } else if(ter != prevTer) {
844                 uint32_t lowerLimit =
845                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
846                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
847                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
848                 } else {
849                     uint32_t newWeight = cw3.nextWeight();
850                     if(!(prevTer < newWeight && newWeight < ter)) {
851                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
852                               (long)lowerLimit, (long)newWeight, (long)ter);
853                     }
854                 }
855             } else {
856                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
857             }
858         }
859         prevPri = pri;
860         prevSec = sec;
861         prevTer = ter;
862     }
863 }
864 
TestTailoredElements()865 void CollationTest::TestTailoredElements() {
866     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
867     const CollationData *root = CollationRoot::getData(errorCode);
868     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
869         return;
870     }
871     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
872 
873     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
874     if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
875         return;
876     }
877     uhash_setKeyDeleter(prevLocales, uprv_free);
878     // TestRootElements() tests the root collator which does not have tailorings.
879     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
880     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
881     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
882 
883     UVector64 ces(errorCode);
884     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
885     U_ASSERT(locales.isValid());
886     const char *localeID = "root";
887     do {
888         Locale locale(localeID);
889         LocalPointer<StringEnumeration> types(
890                 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
891         errorCode.assertSuccess();
892         const char *type;  // first: default type
893         while((type = types->next(NULL, errorCode)) != NULL) {
894             if(strncmp(type, "private-", 8) == 0) {
895                 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
896                         localeID, type);
897             }
898             Locale localeWithType(locale);
899             localeWithType.setKeywordValue("collation", type, errorCode);
900             errorCode.assertSuccess();
901             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
902             if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
903                                               localeWithType.getName())) {
904                 continue;
905             }
906             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
907             if(uhash_geti(prevLocales, actual.getName()) != 0) {
908                 continue;
909             }
910             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
911             errorCode.assertSuccess();
912             logln("TestTailoredElements(): requested %s -> actual %s",
913                   localeWithType.getName(), actual.getName());
914             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
915             if(rbc == NULL) {
916                 continue;
917             }
918             // Note: It would be better to get tailored strings such that we can
919             // identify the prefix, and only get the CEs for the prefix+string,
920             // not also for the prefix.
921             // There is currently no API for that.
922             // It would help in an unusual case where a contraction starting in the prefix
923             // extends past its end, and we do not see the intended mapping.
924             // For example, for a mapping p|st, if there is also a contraction ps,
925             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
926             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
927             errorCode.assertSuccess();
928             UnicodeSetIterator iter(*tailored);
929             while(iter.next()) {
930                 const UnicodeString &s = iter.getString();
931                 ces.removeAllElements();
932                 rbc->internalGetCEs(s, ces, errorCode);
933                 errorCode.assertSuccess();
934                 for(int32_t i = 0; i < ces.size(); ++i) {
935                     int64_t ce = ces.elementAti(i);
936                     if(!isValidCE(rootElements, *root, ce)) {
937                         errln("invalid tailored CE %016llx at CE index %d from string:",
938                               (long long)ce, (int)i);
939                         infoln(prettify(s));
940                     }
941                 }
942             }
943         }
944     } while((localeID = locales->next(NULL, errorCode)) != NULL);
945     uhash_close(prevLocales);
946 }
947 
printSortKey(const uint8_t * p,int32_t length)948 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
949     UnicodeString s;
950     for(int32_t i = 0; i < length; ++i) {
951         if(i > 0) { s.append((UChar)0x20); }
952         uint8_t b = p[i];
953         if(b == 0) {
954             s.append((UChar)0x2e);  // period
955         } else if(b == 1) {
956             s.append((UChar)0x7c);  // vertical bar
957         } else {
958             appendHex(b, 2, s);
959         }
960     }
961     return s;
962 }
963 
printCollationKey(const CollationKey & key)964 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
965     int32_t length;
966     const uint8_t *p = key.getByteArray(length);
967     return printSortKey(p, length);
968 }
969 
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)970 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
971     for(;;) {
972         int32_t lineLength;
973         const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
974         if(line == NULL || errorCode.isFailure()) {
975             fileLine.remove();
976             return FALSE;
977         }
978         ++fileLineNumber;
979         // Strip trailing CR/LF, comments, and spaces.
980         const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
981         if(comment != NULL) {
982             lineLength = (int32_t)(comment - line);
983         } else {
984             while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
985         }
986         while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
987         if(lineLength != 0) {
988             fileLine.setTo(FALSE, line, lineLength);
989             return TRUE;
990         }
991         // Empty line, continue.
992     }
993 }
994 
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)995 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
996                                 UErrorCode &errorCode) {
997     int32_t length = fileLine.length();
998     int32_t i;
999     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1000     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
1001     if(pipeIndex >= 0) {
1002         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1003         if(prefix.isEmpty()) {
1004             errln("empty prefix on line %d", (int)fileLineNumber);
1005             infoln(fileLine);
1006             errorCode = U_PARSE_ERROR;
1007             return;
1008         }
1009         start = pipeIndex + 1;
1010     } else {
1011         prefix.remove();
1012     }
1013     s = fileLine.tempSubStringBetween(start, i).unescape();
1014     if(s.isEmpty()) {
1015         errln("empty string on line %d", (int)fileLineNumber);
1016         infoln(fileLine);
1017         errorCode = U_PARSE_ERROR;
1018         return;
1019     }
1020     start = i;
1021 }
1022 
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1023 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1024     Collation::Level relation;
1025     int32_t start;
1026     if(fileLine[0] == 0x3c) {  // <
1027         UChar second = fileLine[1];
1028         start = 2;
1029         switch(second) {
1030         case 0x31:  // <1
1031             relation = Collation::PRIMARY_LEVEL;
1032             break;
1033         case 0x32:  // <2
1034             relation = Collation::SECONDARY_LEVEL;
1035             break;
1036         case 0x33:  // <3
1037             relation = Collation::TERTIARY_LEVEL;
1038             break;
1039         case 0x34:  // <4
1040             relation = Collation::QUATERNARY_LEVEL;
1041             break;
1042         case 0x63:  // <c
1043             relation = Collation::CASE_LEVEL;
1044             break;
1045         case 0x69:  // <i
1046             relation = Collation::IDENTICAL_LEVEL;
1047             break;
1048         default:  // just <
1049             relation = Collation::NO_LEVEL;
1050             start = 1;
1051             break;
1052         }
1053     } else if(fileLine[0] == 0x3d) {  // =
1054         relation = Collation::ZERO_LEVEL;
1055         start = 1;
1056     } else {
1057         start = 0;
1058     }
1059     if(start == 0 || !isSpace(fileLine[start])) {
1060         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1061         infoln(fileLine);
1062         errorCode.set(U_PARSE_ERROR);
1063         return Collation::NO_LEVEL;
1064     }
1065     start = skipSpaces(start);
1066     UnicodeString prefix;
1067     parseString(start, prefix, s, errorCode);
1068     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1069         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1070         infoln(fileLine);
1071         errorCode.set(U_PARSE_ERROR);
1072         return Collation::NO_LEVEL;
1073     }
1074     if(start < fileLine.length()) {
1075         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1076         infoln(fileLine);
1077         errorCode.set(U_PARSE_ERROR);
1078         return Collation::NO_LEVEL;
1079     }
1080     return relation;
1081 }
1082 
1083 static const struct {
1084     const char *name;
1085     UColAttribute attr;
1086 } attributes[] = {
1087     { "backwards", UCOL_FRENCH_COLLATION },
1088     { "alternate", UCOL_ALTERNATE_HANDLING },
1089     { "caseFirst", UCOL_CASE_FIRST },
1090     { "caseLevel", UCOL_CASE_LEVEL },
1091     // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1092     { "strength", UCOL_STRENGTH },
1093     // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1094     { "numeric", UCOL_NUMERIC_COLLATION }
1095 };
1096 
1097 static const struct {
1098     const char *name;
1099     UColAttributeValue value;
1100 } attributeValues[] = {
1101     { "default", UCOL_DEFAULT },
1102     { "primary", UCOL_PRIMARY },
1103     { "secondary", UCOL_SECONDARY },
1104     { "tertiary", UCOL_TERTIARY },
1105     { "quaternary", UCOL_QUATERNARY },
1106     { "identical", UCOL_IDENTICAL },
1107     { "off", UCOL_OFF },
1108     { "on", UCOL_ON },
1109     { "shifted", UCOL_SHIFTED },
1110     { "non-ignorable", UCOL_NON_IGNORABLE },
1111     { "lower", UCOL_LOWER_FIRST },
1112     { "upper", UCOL_UPPER_FIRST }
1113 };
1114 
parseAndSetAttribute(IcuTestErrorCode & errorCode)1115 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1116     // Parse attributes even if the Collator could not be created,
1117     // in order to report syntax errors.
1118     int32_t start = skipSpaces(1);
1119     int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1120     if(equalPos < 0) {
1121         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1122             parseAndSetReorderCodes(start + 7, errorCode);
1123             return;
1124         }
1125         errln("missing '=' on line %d", (int)fileLineNumber);
1126         infoln(fileLine);
1127         errorCode.set(U_PARSE_ERROR);
1128         return;
1129     }
1130 
1131     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1132     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1133     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1134         UColReorderCode max;
1135         if(valueString == UNICODE_STRING("space", 5)) {
1136             max = UCOL_REORDER_CODE_SPACE;
1137         } else if(valueString == UNICODE_STRING("punct", 5)) {
1138             max = UCOL_REORDER_CODE_PUNCTUATION;
1139         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1140             max = UCOL_REORDER_CODE_SYMBOL;
1141         } else if(valueString == UNICODE_STRING("currency", 8)) {
1142             max = UCOL_REORDER_CODE_CURRENCY;
1143         } else {
1144             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1145             infoln(fileLine);
1146             errorCode.set(U_PARSE_ERROR);
1147             return;
1148         }
1149         if(coll != NULL) {
1150             coll->setMaxVariable(max, errorCode);
1151             if(errorCode.isFailure()) {
1152                 errln("setMaxVariable() failed on line %d: %s",
1153                       (int)fileLineNumber, errorCode.errorName());
1154                 infoln(fileLine);
1155                 return;
1156             }
1157         }
1158         fileLine.remove();
1159         return;
1160     }
1161 
1162     UColAttribute attr;
1163     for(int32_t i = 0;; ++i) {
1164         if(i == UPRV_LENGTHOF(attributes)) {
1165             errln("invalid attribute name on line %d", (int)fileLineNumber);
1166             infoln(fileLine);
1167             errorCode.set(U_PARSE_ERROR);
1168             return;
1169         }
1170         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1171             attr = attributes[i].attr;
1172             break;
1173         }
1174     }
1175 
1176     UColAttributeValue value;
1177     for(int32_t i = 0;; ++i) {
1178         if(i == UPRV_LENGTHOF(attributeValues)) {
1179             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1180             infoln(fileLine);
1181             errorCode.set(U_PARSE_ERROR);
1182             return;
1183         }
1184         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1185             value = attributeValues[i].value;
1186             break;
1187         }
1188     }
1189 
1190     if(coll != NULL) {
1191         coll->setAttribute(attr, value, errorCode);
1192         if(errorCode.isFailure()) {
1193             errln("illegal attribute=value combination on line %d: %s",
1194                   (int)fileLineNumber, errorCode.errorName());
1195             infoln(fileLine);
1196             return;
1197         }
1198     }
1199     fileLine.remove();
1200 }
1201 
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1202 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1203     UVector32 reorderCodes(errorCode);
1204     while(start < fileLine.length()) {
1205         start = skipSpaces(start);
1206         int32_t limit = start;
1207         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1208         CharString name;
1209         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1210         int32_t code = CollationRuleParser::getReorderCode(name.data());
1211         if(code < 0) {
1212             if(uprv_stricmp(name.data(), "default") == 0) {
1213                 code = UCOL_REORDER_CODE_DEFAULT;  // -1
1214             } else {
1215                 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1216                 infoln(fileLine);
1217                 errorCode.set(U_PARSE_ERROR);
1218                 return;
1219             }
1220         }
1221         reorderCodes.addElement(code, errorCode);
1222         start = limit;
1223     }
1224     if(coll != NULL) {
1225         coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1226         if(errorCode.isFailure()) {
1227             errln("setReorderCodes() failed on line %d: %s",
1228                   (int)fileLineNumber, errorCode.errorName());
1229             infoln(fileLine);
1230             return;
1231         }
1232     }
1233     fileLine.remove();
1234 }
1235 
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1236 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1237     UnicodeString rules;
1238     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1239         rules.append(fileLine.unescape());
1240     }
1241     if(errorCode.isFailure()) { return; }
1242     logln(rules);
1243 
1244     UParseError parseError;
1245     UnicodeString reason;
1246     delete coll;
1247     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1248     if(coll == NULL) {
1249         errln("unable to allocate a new collator");
1250         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1251         return;
1252     }
1253     if(errorCode.isFailure()) {
1254         dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1255         infoln(UnicodeString("  reason: ") + reason);
1256         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1257         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1258             infoln(UnicodeString("  snippet: ...") +
1259                 parseError.preContext + "(!)" + parseError.postContext + "...");
1260         }
1261         delete coll;
1262         coll = NULL;
1263         errorCode.reset();
1264     } else {
1265         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1266                      UnicodeString(), reason);
1267     }
1268 }
1269 
setRootCollator(IcuTestErrorCode & errorCode)1270 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1271     if(errorCode.isFailure()) { return; }
1272     delete coll;
1273     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1274     if(errorCode.isFailure()) {
1275         dataerrln("unable to create a root collator");
1276         return;
1277     }
1278 }
1279 
setLocaleCollator(IcuTestErrorCode & errorCode)1280 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1281     if(errorCode.isFailure()) { return; }
1282     delete coll;
1283     coll = NULL;
1284     int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1285     if(at >= 0) {
1286         fileLine.setCharAt(at, (UChar)0x2a);  // *
1287     }
1288     CharString localeID;
1289     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1290     if(at >= 0) {
1291         localeID.data()[at - 9] = '@';
1292     }
1293     Locale locale(localeID.data());
1294     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1295         errln("invalid language tag on line %d", (int)fileLineNumber);
1296         infoln(fileLine);
1297         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1298         return;
1299     }
1300 
1301     logln("creating a collator for locale ID %s", locale.getName());
1302     coll = Collator::createInstance(locale, errorCode);
1303     if(errorCode.isFailure()) {
1304         dataerrln("unable to create a collator for locale %s on line %d",
1305                   locale.getName(), (int)fileLineNumber);
1306         infoln(fileLine);
1307         delete coll;
1308         coll = NULL;
1309         errorCode.reset();
1310     }
1311 }
1312 
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1313 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1314     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1315     // In some sequences with Tibetan composite vowel signs,
1316     // even if the string passes the FCD check,
1317     // those composites must be decomposed.
1318     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1319     int32_t index = 0;
1320     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1321         if(++index < s.length()) {
1322             UChar c = s[index];
1323             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1324         }
1325     }
1326     return FALSE;
1327 }
1328 
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1329 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1330                                      CharString &dest, int32_t partSize,
1331                                      IcuTestErrorCode &errorCode) {
1332     if(errorCode.isFailure()) { return FALSE; }
1333     uint8_t part[32];
1334     U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1335     UCharIterator iter;
1336     uiter_setString(&iter, s, length);
1337     uint32_t state[2] = { 0, 0 };
1338     for(;;) {
1339         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1340         UBool done = partLength < partSize;
1341         if(done) {
1342             // At the end, append the next byte as well which should be 00.
1343             ++partLength;
1344         }
1345         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1346         if(done) {
1347             return errorCode.isSuccess();
1348         }
1349     }
1350 }
1351 
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1352 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1353                                      const UChar *s, int32_t length,
1354                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1355     if(errorCode.isFailure()) { return FALSE; }
1356     coll->getCollationKey(s, length, key, errorCode);
1357     if(errorCode.isFailure()) {
1358         infoln(fileTestName);
1359         errln("Collator(%s).getCollationKey() failed: %s",
1360               norm, errorCode.errorName());
1361         infoln(line);
1362         return FALSE;
1363     }
1364     int32_t keyLength;
1365     const uint8_t *keyBytes = key.getByteArray(keyLength);
1366     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1367         infoln(fileTestName);
1368         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1369               norm);
1370         infoln(line);
1371         infoln(printCollationKey(key));
1372         return FALSE;
1373     }
1374 
1375     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1376     if(numLevels < UCOL_IDENTICAL) {
1377         ++numLevels;
1378     } else {
1379         numLevels = 5;
1380     }
1381     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1382         ++numLevels;
1383     }
1384     errorCode.assertSuccess();
1385     int32_t numLevelSeparators = 0;
1386     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1387         uint8_t b = keyBytes[i];
1388         if(b == 0) {
1389             infoln(fileTestName);
1390             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1391             infoln(line);
1392             infoln(printCollationKey(key));
1393             return FALSE;
1394         }
1395         if(b == 1) { ++numLevelSeparators; }
1396     }
1397     if(numLevelSeparators != (numLevels - 1)) {
1398         infoln(fileTestName);
1399         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1400               norm, (int)numLevelSeparators, (int)numLevels);
1401         infoln(line);
1402         infoln(printCollationKey(key));
1403         return FALSE;
1404     }
1405 
1406     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1407     static const int32_t partSizes[] = { 32, 3, 1 };
1408     for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1409         int32_t partSize = partSizes[psi];
1410         CharString parts;
1411         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1412             infoln(fileTestName);
1413             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1414                   norm, (int)partSize, errorCode.errorName());
1415             infoln(line);
1416             return FALSE;
1417         }
1418         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1419             infoln(fileTestName);
1420             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1421                   norm, (int)partSize);
1422             infoln(line);
1423             infoln(printCollationKey(key));
1424             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1425             return FALSE;
1426         }
1427     }
1428     return TRUE;
1429 }
1430 
1431 /**
1432  * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1433  * Leaves key unchanged if s does not contain U+FFFE.
1434  * @return TRUE if the key was successfully changed
1435  */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1436 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1437                                            CollationKey &key, IcuTestErrorCode &errorCode) {
1438     if(errorCode.isFailure()) { return FALSE; }
1439     LocalMemory<uint8_t> mergedKey;
1440     int32_t mergedKeyLength = 0;
1441     int32_t mergedKeyCapacity = 0;
1442     int32_t sLength = (length >= 0) ? length : u_strlen(s);
1443     int32_t segmentStart = 0;
1444     for(int32_t i = 0;;) {
1445         if(i == sLength) {
1446             if(segmentStart == 0) {
1447                 // s does not contain any U+FFFE.
1448                 return FALSE;
1449             }
1450         } else if(s[i] != 0xfffe) {
1451             ++i;
1452             continue;
1453         }
1454         // Get the sort key for another segment and merge it into mergedKey.
1455         CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1456         CollationKey key2;
1457         coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1458         int32_t key1Length, key2Length;
1459         const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1460         const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1461         uint8_t *dest;
1462         int32_t minCapacity = key1Length + key2Length;
1463         if(key1Length > 0) { --minCapacity; }
1464         if(minCapacity <= mergedKeyCapacity) {
1465             dest = mergedKey.getAlias();
1466         } else {
1467             if(minCapacity <= 200) {
1468                 mergedKeyCapacity = 200;
1469             } else if(minCapacity <= 2 * mergedKeyCapacity) {
1470                 mergedKeyCapacity *= 2;
1471             } else {
1472                 mergedKeyCapacity = minCapacity;
1473             }
1474             dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1475         }
1476         U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1477         if(key1Length == 0) {
1478             // key2 is the sort key for the first segment.
1479             uprv_memcpy(dest, key2Bytes, key2Length);
1480             mergedKeyLength = key2Length;
1481         } else {
1482             mergedKeyLength =
1483                 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1484                                    dest, mergedKeyCapacity);
1485         }
1486         if(i == sLength) { break; }
1487         segmentStart = ++i;
1488     }
1489     key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1490     return TRUE;
1491 }
1492 
1493 namespace {
1494 
1495 /**
1496  * Replaces unpaired surrogates with U+FFFD.
1497  * Returns s if no replacement was made, otherwise buffer.
1498  */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1499 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1500     int32_t i = 0;
1501     while(i < s.length()) {
1502         UChar32 c = s.char32At(i);
1503         if(U_IS_SURROGATE(c)) {
1504             if(buffer.length() < i) {
1505                 buffer.append(s, buffer.length(), i - buffer.length());
1506             }
1507             buffer.append((UChar)0xfffd);
1508         }
1509         i += U16_LENGTH(c);
1510     }
1511     if(buffer.isEmpty()) {
1512         return s;
1513     }
1514     if(buffer.length() < i) {
1515         buffer.append(s, buffer.length(), i - buffer.length());
1516     }
1517     return buffer;
1518 }
1519 
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1520 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1521                            UCollationResult order, UBool collHasCaseLevel) {
1522     if(order == UCOL_EQUAL) {
1523         return Collation::NO_LEVEL;
1524     }
1525     int32_t prevKeyLength;
1526     const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1527     int32_t keyLength;
1528     const uint8_t *bytes = key.getByteArray(keyLength);
1529     int32_t level = Collation::PRIMARY_LEVEL;
1530     for(int32_t i = 0;; ++i) {
1531         uint8_t b = prevBytes[i];
1532         if(b != bytes[i]) { break; }
1533         if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1534             ++level;
1535             if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1536                 ++level;
1537             }
1538         }
1539     }
1540     return level;
1541 }
1542 
1543 }
1544 
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1545 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1546                                      const UnicodeString &prevString, const UnicodeString &s,
1547                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1548                                      IcuTestErrorCode &errorCode) {
1549     if(errorCode.isFailure()) { return FALSE; }
1550 
1551     // Get the sort keys first, for error debug output.
1552     CollationKey prevKey;
1553     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1554                         prevKey, errorCode)) {
1555         return FALSE;
1556     }
1557     CollationKey key;
1558     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1559 
1560     UCollationResult order = coll->compare(prevString, s, errorCode);
1561     if(order != expectedOrder || errorCode.isFailure()) {
1562         infoln(fileTestName);
1563         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1564               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1565         infoln(prevFileLine);
1566         infoln(fileLine);
1567         infoln(printCollationKey(prevKey));
1568         infoln(printCollationKey(key));
1569         return FALSE;
1570     }
1571     order = coll->compare(s, prevString, errorCode);
1572     if(order != -expectedOrder || errorCode.isFailure()) {
1573         infoln(fileTestName);
1574         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1575               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1576         infoln(prevFileLine);
1577         infoln(fileLine);
1578         infoln(printCollationKey(prevKey));
1579         infoln(printCollationKey(key));
1580         return FALSE;
1581     }
1582     // Test NUL-termination if the strings do not contain NUL characters.
1583     UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1584     if(!containNUL) {
1585         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1586         if(order != expectedOrder || errorCode.isFailure()) {
1587             infoln(fileTestName);
1588             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1589                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1590             infoln(prevFileLine);
1591             infoln(fileLine);
1592             infoln(printCollationKey(prevKey));
1593             infoln(printCollationKey(key));
1594             return FALSE;
1595         }
1596         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1597         if(order != -expectedOrder || errorCode.isFailure()) {
1598             infoln(fileTestName);
1599             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1600                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1601             infoln(prevFileLine);
1602             infoln(fileLine);
1603             infoln(printCollationKey(prevKey));
1604             infoln(printCollationKey(key));
1605             return FALSE;
1606         }
1607     }
1608 
1609     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1610     // Unpaired surrogates cannot be converted to UTF-8.
1611     // Create valid UTF-16 strings if necessary, and use those for
1612     // both the expected compare() result and for the input to compare(UTF-8).
1613     UnicodeString prevBuffer, sBuffer;
1614     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1615     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1616     std::string prevUTF8, sUTF8;
1617     UnicodeString(prevValid).toUTF8String(prevUTF8);
1618     UnicodeString(sValid).toUTF8String(sUTF8);
1619     UCollationResult expectedUTF8Order;
1620     if(&prevValid == &prevString && &sValid == &s) {
1621         expectedUTF8Order = expectedOrder;
1622     } else {
1623         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1624     }
1625 
1626     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1627     if(order != expectedUTF8Order || errorCode.isFailure()) {
1628         infoln(fileTestName);
1629         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1630               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1631         infoln(prevFileLine);
1632         infoln(fileLine);
1633         infoln(printCollationKey(prevKey));
1634         infoln(printCollationKey(key));
1635         return FALSE;
1636     }
1637     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1638     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1639         infoln(fileTestName);
1640         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1641               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1642         infoln(prevFileLine);
1643         infoln(fileLine);
1644         infoln(printCollationKey(prevKey));
1645         infoln(printCollationKey(key));
1646         return FALSE;
1647     }
1648     // Test NUL-termination if the strings do not contain NUL characters.
1649     if(!containNUL) {
1650         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1651         if(order != expectedUTF8Order || errorCode.isFailure()) {
1652             infoln(fileTestName);
1653             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1654                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1655             infoln(prevFileLine);
1656             infoln(fileLine);
1657             infoln(printCollationKey(prevKey));
1658             infoln(printCollationKey(key));
1659             return FALSE;
1660         }
1661         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1662         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1663             infoln(fileTestName);
1664             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1665                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1666             infoln(prevFileLine);
1667             infoln(fileLine);
1668             infoln(printCollationKey(prevKey));
1669             infoln(printCollationKey(key));
1670             return FALSE;
1671         }
1672     }
1673 
1674     UCharIterator leftIter;
1675     UCharIterator rightIter;
1676     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1677     uiter_setString(&rightIter, s.getBuffer(), s.length());
1678     order = coll->compare(leftIter, rightIter, errorCode);
1679     if(order != expectedOrder || errorCode.isFailure()) {
1680         infoln(fileTestName);
1681         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1682               "wrong order: %d != %d (%s)",
1683               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1684         infoln(prevFileLine);
1685         infoln(fileLine);
1686         infoln(printCollationKey(prevKey));
1687         infoln(printCollationKey(key));
1688         return FALSE;
1689     }
1690 
1691     order = prevKey.compareTo(key, errorCode);
1692     if(order != expectedOrder || errorCode.isFailure()) {
1693         infoln(fileTestName);
1694         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1695               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1696         infoln(prevFileLine);
1697         infoln(fileLine);
1698         infoln(printCollationKey(prevKey));
1699         infoln(printCollationKey(key));
1700         return FALSE;
1701     }
1702     UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1703     int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1704     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1705         if(level != expectedLevel) {
1706             infoln(fileTestName);
1707             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1708                   (int)fileLineNumber, norm, order, level, expectedLevel);
1709             infoln(prevFileLine);
1710             infoln(fileLine);
1711             infoln(printCollationKey(prevKey));
1712             infoln(printCollationKey(key));
1713             return FALSE;
1714         }
1715     }
1716 
1717     // If either string contains U+FFFE, then their sort keys must compare the same as
1718     // the merged sort keys of each string's between-FFFE segments.
1719     //
1720     // It is not required that
1721     //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1722     // only that those two methods yield the same order.
1723     //
1724     // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1725     if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1726                 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1727             errorCode.isFailure()) {
1728         order = prevKey.compareTo(key, errorCode);
1729         if(order != expectedOrder || errorCode.isFailure()) {
1730             infoln(fileTestName);
1731             errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1732                 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1733                 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1734             infoln(prevFileLine);
1735             infoln(fileLine);
1736             infoln(printCollationKey(prevKey));
1737             infoln(printCollationKey(key));
1738             return FALSE;
1739         }
1740         int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1741         if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1742             if(mergedLevel != level) {
1743                 infoln(fileTestName);
1744                 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1745                     "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1746                     (int)fileLineNumber, norm, order, mergedLevel, level);
1747                 infoln(prevFileLine);
1748                 infoln(fileLine);
1749                 infoln(printCollationKey(prevKey));
1750                 infoln(printCollationKey(key));
1751                 return FALSE;
1752             }
1753         }
1754     }
1755     return TRUE;
1756 }
1757 
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1758 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1759     if(errorCode.isFailure()) { return; }
1760     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1761     UnicodeString prevString, s;
1762     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1763     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1764         // Parse the line even if it will be ignored (when we do not have a Collator)
1765         // in order to report syntax issues.
1766         Collation::Level relation = parseRelationAndString(s, errorCode);
1767         if(errorCode.isFailure()) {
1768             errorCode.reset();
1769             break;
1770         }
1771         if(coll == NULL) {
1772             // We were unable to create the Collator but continue with tests.
1773             // Ignore test data for this Collator.
1774             // The next Collator creation might work.
1775             continue;
1776         }
1777         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1778         Collation::Level expectedLevel = relation;
1779         s.getTerminatedBuffer();  // Ensure NUL-termination.
1780         UBool isOk = TRUE;
1781         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1782             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1783             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1784                                    expectedOrder, expectedLevel, errorCode);
1785         }
1786         if(isOk) {
1787             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1788             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1789                                    expectedOrder, expectedLevel, errorCode);
1790         }
1791         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1792             UnicodeString pn = nfd->normalize(prevString, errorCode);
1793             UnicodeString n = nfd->normalize(s, errorCode);
1794             pn.getTerminatedBuffer();
1795             n.getTerminatedBuffer();
1796             errorCode.assertSuccess();
1797             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1798                                    expectedOrder, expectedLevel, errorCode);
1799         }
1800         if(!isOk) {
1801             errorCode.reset();  // already reported
1802         }
1803         prevFileLine = fileLine;
1804         prevString = s;
1805         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1806     }
1807 }
1808 
TestDataDriven()1809 void CollationTest::TestDataDriven() {
1810     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1811 
1812     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1813     nfd = Normalizer2::getNFDInstance(errorCode);
1814     if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1815         return;
1816     }
1817 
1818     CharString path(getSourceTestData(errorCode), errorCode);
1819     path.appendPathPart("collationtest.txt", errorCode);
1820     const char *codePage = "UTF-8";
1821     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1822     if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1823         return;
1824     }
1825     // Read a new line if necessary.
1826     // Sub-parsers leave the first line set that they do not handle.
1827     while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1828         if(!isSectionStarter(fileLine[0])) {
1829             errln("syntax error on line %d", (int)fileLineNumber);
1830             infoln(fileLine);
1831             return;
1832         }
1833         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1834             fileTestName = fileLine;
1835             logln(fileLine);
1836             fileLine.remove();
1837         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1838             setRootCollator(errorCode);
1839             fileLine.remove();
1840         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1841             setLocaleCollator(errorCode);
1842             fileLine.remove();
1843         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1844             buildTailoring(f.getAlias(), errorCode);
1845         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1846             parseAndSetAttribute(errorCode);
1847         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1848             checkCompareStrings(f.getAlias(), errorCode);
1849         } else {
1850             errln("syntax error on line %d", (int)fileLineNumber);
1851             infoln(fileLine);
1852             return;
1853         }
1854     }
1855 }
1856 
TestLongLocale()1857 void CollationTest::TestLongLocale() {
1858     IcuTestErrorCode errorCode(*this, "TestLongLocale");
1859     Locale longLocale("sie__1G_C_CEIE_CEZCX_CSUE_E_EIESZNI2_GB_LM_LMCSUE_LMCSX_"
1860                       "LVARIANT_MMCSIE_STEU_SU1GCEIE_SU6G_SU6SU6G_U_UBGE_UC_"
1861                       "UCEZCSI_UCIE_UZSIU_VARIANT_X@collation=bcs-ukvsz");
1862     LocalPointer<Collator> coll(Collator::createInstance(longLocale, errorCode));
1863 }
1864 
1865 #endif  // !UCONFIG_NO_COLLATION
1866