• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/stringpiece.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uniset.h"
29 #include "unicode/unistr.h"
30 #include "unicode/usetiter.h"
31 #include "unicode/ustring.h"
32 #include "charstr.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationfcd.h"
37 #include "collationiterator.h"
38 #include "collationroot.h"
39 #include "collationrootelements.h"
40 #include "collationruleparser.h"
41 #include "collationweights.h"
42 #include "cstring.h"
43 #include "intltest.h"
44 #include "normalizer2impl.h"
45 #include "ucbuf.h"
46 #include "uhash.h"
47 #include "uitercollationiterator.h"
48 #include "utf16collationiterator.h"
49 #include "utf8collationiterator.h"
50 #include "uvectr32.h"
51 #include "uvectr64.h"
52 #include "writesrc.h"
53 
54 class CodePointIterator;
55 
56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57 
58 class CollationTest : public IntlTest {
59 public:
CollationTest()60     CollationTest()
61             : fcd(NULL), nfd(NULL),
62               fileLineNumber(0),
63               coll(NULL) {}
64 
~CollationTest()65     ~CollationTest() {
66         delete coll;
67     }
68 
69     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
70 
71     void TestMinMax();
72     void TestImplicits();
73     void TestNulTerminated();
74     void TestIllegalUTF8();
75     void TestShortFCDData();
76     void TestFCD();
77     void TestCollationWeights();
78     void TestRootElements();
79     void TestTailoredElements();
80     void TestDataDriven();
81 
82 private:
83     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
84     void checkAllocWeights(CollationWeights &cw,
85                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
86                            int32_t someLength, int32_t minCount);
87 
88     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
89     static UnicodeString printCollationKey(const CollationKey &key);
90 
91     // Helpers & fields for data-driven test.
isCROrLF(UChar c)92     static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)93     static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)94     static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
skipSpaces(int32_t i)95     int32_t skipSpaces(int32_t i) {
96         while(isSpace(fileLine[i])) { ++i; }
97         return i;
98     }
99 
100     UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
101     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
102     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
103     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
104     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
105     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
106     void setRootCollator(IcuTestErrorCode &errorCode);
107     void setLocaleCollator(IcuTestErrorCode &errorCode);
108 
109     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
110 
111     UBool getSortKeyParts(const UChar *s, int32_t length,
112                           CharString &dest, int32_t partSize,
113                           IcuTestErrorCode &errorCode);
114     UBool getCollationKey(const char *norm, const UnicodeString &line,
115                           const UChar *s, int32_t length,
116                           CollationKey &key, IcuTestErrorCode &errorCode);
117     UBool getMergedCollationKey(const UChar *s, int32_t length,
118                                 CollationKey &key, IcuTestErrorCode &errorCode);
119     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
120                           const UnicodeString &prevString, const UnicodeString &s,
121                           UCollationResult expectedOrder, Collation::Level expectedLevel,
122                           IcuTestErrorCode &errorCode);
123     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
124 
125     const Normalizer2 *fcd, *nfd;
126     UnicodeString fileLine;
127     int32_t fileLineNumber;
128     UnicodeString fileTestName;
129     Collator *coll;
130 };
131 
createCollationTest()132 extern IntlTest *createCollationTest() {
133     return new CollationTest();
134 }
135 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)136 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
137     if(exec) {
138         logln("TestSuite CollationTest: ");
139     }
140     TESTCASE_AUTO_BEGIN;
141     TESTCASE_AUTO(TestMinMax);
142     TESTCASE_AUTO(TestImplicits);
143     TESTCASE_AUTO(TestNulTerminated);
144     TESTCASE_AUTO(TestIllegalUTF8);
145     TESTCASE_AUTO(TestShortFCDData);
146     TESTCASE_AUTO(TestFCD);
147     TESTCASE_AUTO(TestCollationWeights);
148     TESTCASE_AUTO(TestRootElements);
149     TESTCASE_AUTO(TestTailoredElements);
150     TESTCASE_AUTO(TestDataDriven);
151     TESTCASE_AUTO_END;
152 }
153 
TestMinMax()154 void CollationTest::TestMinMax() {
155     IcuTestErrorCode errorCode(*this, "TestMinMax");
156 
157     setRootCollator(errorCode);
158     if(errorCode.isFailure()) {
159         errorCode.reset();
160         return;
161     }
162     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
163     if(rbc == NULL) {
164         errln("the root collator is not a RuleBasedCollator");
165         return;
166     }
167 
168     static const UChar s[2] = { 0xfffe, 0xffff };
169     UVector64 ces(errorCode);
170     rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
171     errorCode.assertSuccess();
172     if(ces.size() != 2) {
173         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
174         return;
175     }
176     int64_t ce = ces.elementAti(0);
177     int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
178     if(ce != expected) {
179         errln("CE(U+fffe)=%04lx != 02..", (long)ce);
180     }
181 
182     ce = ces.elementAti(1);
183     expected = Collation::makeCE(Collation::MAX_PRIMARY);
184     if(ce != expected) {
185         errln("CE(U+ffff)=%04lx != max..", (long)ce);
186     }
187 }
188 
TestImplicits()189 void CollationTest::TestImplicits() {
190     IcuTestErrorCode errorCode(*this, "TestImplicits");
191 
192     const CollationData *cd = CollationRoot::getData(errorCode);
193     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
194         return;
195     }
196 
197     // Implicit primary weights should be assigned for the following sets,
198     // and sort in ascending order by set and then code point.
199     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
200 
201     // core Han Unified Ideographs
202     UnicodeSet coreHan("[\\p{unified_ideograph}&"
203                             "[\\p{Block=CJK_Unified_Ideographs}"
204                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
205                        errorCode);
206     // all other Unified Han ideographs
207     UnicodeSet otherHan("[\\p{unified ideograph}-"
208                             "[\\p{Block=CJK_Unified_Ideographs}"
209                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
210                         errorCode);
211     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
212     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
213 
214     // Starting with CLDR 26/ICU 54, the root Han order may instead be
215     // the Unihan radical-stroke order.
216     // The tests should pass either way, so we only test the order of a small set of Han characters
217     // whose radical-stroke order is the same as their code point order.
218     UnicodeSet someHanInCPOrder(
219             "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
220             "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
221             errorCode);
222     UnicodeSet inOrder(someHanInCPOrder);
223     inOrder.addAll(unassigned).freeze();
224     if(errorCode.errIfFailureAndReset("UnicodeSet")) {
225         return;
226     }
227     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
228     UChar32 prev = 0;
229     uint32_t prevPrimary = 0;
230     UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
231     for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
232         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
233         while(iter->next()) {
234             UChar32 c = iter->getCodepoint();
235             UnicodeString s(c);
236             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
237             int64_t ce = ci.nextCE(errorCode);
238             int64_t ce2 = ci.nextCE(errorCode);
239             if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
240                 return;
241             }
242             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
243                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
244                 continue;
245             }
246             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
247                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
248                       (long)c, (long)(ce & 0xffffffff));
249                 continue;
250             }
251             uint32_t primary = (uint32_t)(ce >> 32);
252             if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
253                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
254                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
255             }
256             prev = c;
257             prevPrimary = primary;
258         }
259     }
260 }
261 
TestNulTerminated()262 void CollationTest::TestNulTerminated() {
263     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
264     const CollationData *data = CollationRoot::getData(errorCode);
265     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
266         return;
267     }
268 
269     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
270 
271     UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
272     UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
273     for(int32_t i = 0;; ++i) {
274         int64_t ce1 = ci1.nextCE(errorCode);
275         int64_t ce2 = ci2.nextCE(errorCode);
276         if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
277             return;
278         }
279         if(ce1 != ce2) {
280             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
281             break;
282         }
283         if(ce1 == Collation::NO_CE) { break; }
284     }
285 }
286 
TestIllegalUTF8()287 void CollationTest::TestIllegalUTF8() {
288     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
289 
290     setRootCollator(errorCode);
291     if(errorCode.isFailure()) {
292         errorCode.reset();
293         return;
294     }
295     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
296 
297     static const StringPiece strings[] = {
298         // string with U+FFFD == illegal byte sequence
299         u8"a\uFFFDz",                   "a\x80z",  // trail byte
300         u8"a\uFFFD\uFFFDz",             "a\xc1\x81z",  // non-shortest form
301         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xe0\x82\x83z",  // non-shortest form
302         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
303         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
304         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz",  // non-shortest form
305         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
306     };
307 
308     for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
309         StringPiece fffd(strings[i]);
310         StringPiece illegal(strings[i + 1]);
311         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
312         if(order != UCOL_EQUAL) {
313             errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
314                   (int)i, order);
315         }
316     }
317 }
318 
319 namespace {
320 
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)321 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
322     for(UChar32 c = 0x10000; c < 0x110000;) {
323         UChar32 next = c + 0x400;
324         if(src.containsSome(c, next - 1)) {
325             dest.add(U16_LEAD(c));
326         }
327         c = next;
328     }
329 }
330 
331 }  // namespace
332 
TestShortFCDData()333 void CollationTest::TestShortFCDData() {
334     // See CollationFCD class comments.
335     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
336     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
337     errorCode.assertSuccess();
338     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
339     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
340     UnicodeSet lccc;  // actual
341     for(UChar32 c = 0; c <= 0xffff; ++c) {
342         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
343     }
344     UnicodeSet diff(expectedLccc);
345     diff.removeAll(lccc);
346     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
347     UnicodeString empty("[]");
348     UnicodeString diffString;
349     diff.toPattern(diffString, TRUE);
350     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
351     diff = lccc;
352     diff.removeAll(expectedLccc);
353     diff.toPattern(diffString, TRUE);
354     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
355 
356     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
357     if (errorCode.isSuccess()) {
358         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
359         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
360         UnicodeSet tccc;  // actual
361         for(UChar32 c = 0; c <= 0xffff; ++c) {
362             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
363         }
364         diff = expectedTccc;
365         diff.removeAll(tccc);
366         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
367         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
368         diff = tccc;
369         diff.removeAll(expectedTccc);
370         diff.toPattern(diffString, TRUE);
371         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
372     }
373 }
374 
375 class CodePointIterator {
376 public:
CodePointIterator(const UChar32 * cp,int32_t length)377     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()378     void resetToStart() { pos = 0; }
next()379     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()380     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const381     int32_t getLength() const { return length; }
getIndex() const382     int getIndex() const { return (int)pos; }
383 private:
384     const UChar32 *cp;
385     int32_t length;
386     int32_t pos;
387 };
388 
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)389 void CollationTest::checkFCD(const char *name,
390                              CollationIterator &ci, CodePointIterator &cpi) {
391     IcuTestErrorCode errorCode(*this, "checkFCD");
392 
393     // Iterate forward to the limit.
394     for(;;) {
395         UChar32 c1 = ci.nextCodePoint(errorCode);
396         UChar32 c2 = cpi.next();
397         if(c1 != c2) {
398             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
399                   name, (long)c1, (long)c2, cpi.getIndex());
400             return;
401         }
402         if(c1 < 0) { break; }
403     }
404 
405     // Iterate backward most of the way.
406     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
407         UChar32 c1 = ci.previousCodePoint(errorCode);
408         UChar32 c2 = cpi.previous();
409         if(c1 != c2) {
410             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
411                   name, (long)c1, (long)c2, cpi.getIndex());
412             return;
413         }
414     }
415 
416     // Forward again.
417     for(;;) {
418         UChar32 c1 = ci.nextCodePoint(errorCode);
419         UChar32 c2 = cpi.next();
420         if(c1 != c2) {
421             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
422                   name, (long)c1, (long)c2, cpi.getIndex());
423             return;
424         }
425         if(c1 < 0) { break; }
426     }
427 
428     // Iterate backward to the start.
429     for(;;) {
430         UChar32 c1 = ci.previousCodePoint(errorCode);
431         UChar32 c2 = cpi.previous();
432         if(c1 != c2) {
433             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
434                   name, (long)c1, (long)c2, cpi.getIndex());
435             return;
436         }
437         if(c1 < 0) { break; }
438     }
439 }
440 
TestFCD()441 void CollationTest::TestFCD() {
442     IcuTestErrorCode errorCode(*this, "TestFCD");
443     const CollationData *data = CollationRoot::getData(errorCode);
444     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
445         return;
446     }
447 
448     // Input string, not FCD, NUL-terminated.
449     static const UChar s[] = {
450         0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
451         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
452         0x327, 0x308,  // ccc=202, 230
453         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
454         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
455         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
456         0xac01,
457         0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
458         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
459         0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
460         0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
461         0x4e00, 0xf81,
462         0
463     };
464     // Expected code points.
465     static const UChar32 cp[] = {
466         0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
467         0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
468         0x1D15F, 0x1D16D,
469         0xac01,
470         0x63, 0x327, 0x1D165, 0x1D16D,
471         0x61,
472         0xf71, 0xf71, 0xf72, 0xf74, 0x301,
473         0x4e00, 0xf71, 0xf80
474     };
475 
476     FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
477     if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
478         return;
479     }
480     CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
481     checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
482 
483     cpi.resetToStart();
484     std::string utf8;
485     UnicodeString(s).toUTF8String(utf8);
486     FCDUTF8CollationIterator u8ci(data, FALSE,
487                                   reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
488     if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
489         return;
490     }
491     checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
492 
493     cpi.resetToStart();
494     UCharIterator iter;
495     uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
496     FCDUIterCollationIterator uici(data, FALSE, iter, 0);
497     if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
498         return;
499     }
500     checkFCD("FCDUIterCollationIterator", uici, cpi);
501 }
502 
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)503 void CollationTest::checkAllocWeights(CollationWeights &cw,
504                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
505                                       int32_t someLength, int32_t minCount) {
506     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
507         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
508               (long)lowerLimit, (long)upperLimit, (long)n);
509         return;
510     }
511     uint32_t previous = lowerLimit;
512     int32_t count = 0;  // number of weights that have someLength
513     for(int32_t i = 0; i < n; ++i) {
514         uint32_t w = cw.nextWeight();
515         if(w == 0xffffffff) {
516             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
517                   "returns only %ld weights",
518                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
519             return;
520         }
521         if(!(previous < w && w < upperLimit)) {
522             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
523                   "number %ld -> %lx not between %lx and %lx",
524                   (long)lowerLimit, (long)upperLimit, (long)n,
525                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
526             return;
527         }
528         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
529     }
530     if(count < minCount) {
531         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
532               "returns only %ld < %ld weights of length %d",
533               (long)lowerLimit, (long)upperLimit, (long)n,
534               (long)count, (long)minCount, (int)someLength);
535     }
536 }
537 
TestCollationWeights()538 void CollationTest::TestCollationWeights() {
539     CollationWeights cw;
540 
541     // Non-compressible primaries use 254 second bytes 02..FF.
542     logln("CollationWeights.initForPrimary(non-compressible)");
543     cw.initForPrimary(FALSE);
544     // Expect 1 weight 11 and 254 weights 12xx.
545     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
546     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
547     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
548     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
549     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
550     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
551     // Expect 254^2=64516 three-byte weights.
552     // During computation, there should be 3 three-byte ranges
553     // 10ffff, 11xxxx, 120202.
554     // The middle one should be split 64515:1,
555     // and the newly-split-off range and the last ranged lengthened.
556     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
557     // Expect weights 1102 & 1103.
558     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
559     // Expect weights 102102 & 102103.
560     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
561 
562     // Compressible primaries use 251 second bytes 04..FE.
563     logln("CollationWeights.initForPrimary(compressible)");
564     cw.initForPrimary(TRUE);
565     // Expect 1 weight 11 and 251 weights 12xx.
566     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
567     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
568     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
569     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
570     // Expect weights 1104 & 1105.
571     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
572     // Expect weights 102102 & 102103.
573     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
574 
575     // Secondary and tertiary weights use only bytes 3 & 4.
576     logln("CollationWeights.initForSecondary()");
577     cw.initForSecondary();
578     // Expect weights fbxx and all four fc..ff.
579     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
580 
581     logln("CollationWeights.initForTertiary()");
582     cw.initForTertiary();
583     // Expect weights 3dxx and both 3e & 3f.
584     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
585 }
586 
587 namespace {
588 
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)589 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
590                 uint32_t p, uint32_t s, uint32_t ctq) {
591     uint32_t p1 = p >> 24;
592     uint32_t p2 = (p >> 16) & 0xff;
593     uint32_t p3 = (p >> 8) & 0xff;
594     uint32_t p4 = p & 0xff;
595     uint32_t s1 = s >> 8;
596     uint32_t s2 = s & 0xff;
597     // ctq = Case, Tertiary, Quaternary
598     uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
599     uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
600     uint32_t t1 = t >> 8;
601     uint32_t t2 = t & 0xff;
602     uint32_t q = ctq & Collation::QUATERNARY_MASK;
603     // No leading zero bytes.
604     if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
605         return FALSE;
606     }
607     // No intermediate zero bytes.
608     if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
609         return FALSE;
610     }
611     if(p2 != 0 && p3 == 0 && p4 != 0) {
612         return FALSE;
613     }
614     // Minimum & maximum lead bytes.
615     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
616             s1 == Collation::LEVEL_SEPARATOR_BYTE ||
617             t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
618         return FALSE;
619     }
620     if(c > 2) {
621         return FALSE;
622     }
623     // The valid byte range for the second primary byte depends on compressibility.
624     if(p2 != 0) {
625         if(data.isCompressibleLeadByte(p1)) {
626             if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
627                     Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
628                 return FALSE;
629             }
630         } else {
631             if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
632                 return FALSE;
633             }
634         }
635     }
636     // Other bytes just need to avoid the level separator.
637     // Trailing zeros are ok.
638     U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
639     if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
640             s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
641         return FALSE;
642     }
643     // Well-formed CEs.
644     if(p == 0) {
645         if(s == 0) {
646             if(t == 0) {
647                 // Completely ignorable CE.
648                 // Quaternary CEs are not supported.
649                 if(c != 0 || q != 0) {
650                     return FALSE;
651                 }
652             } else {
653                 // Tertiary CE.
654                 if(t < re.getTertiaryBoundary() || c != 2) {
655                     return FALSE;
656                 }
657             }
658         } else {
659             // Secondary CE.
660             if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
661                 return FALSE;
662             }
663         }
664     } else {
665         // Primary CE.
666         if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
667                 s >= re.getSecondaryBoundary()) {
668             return FALSE;
669         }
670         if(t == 0 || t >= re.getTertiaryBoundary()) {
671             return FALSE;
672         }
673     }
674     return TRUE;
675 }
676 
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)677 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
678     uint32_t p = (uint32_t)(ce >> 32);
679     uint32_t secTer = (uint32_t)ce;
680     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
681 }
682 
683 class RootElementsIterator {
684 public:
RootElementsIterator(const CollationData & root)685     RootElementsIterator(const CollationData &root)
686             : data(root),
687               elements(root.rootElements), length(root.rootElementsLength),
688               pri(0), secTer(0),
689               index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
690 
next()691     UBool next() {
692         if(index >= length) { return FALSE; }
693         uint32_t p = elements[index];
694         if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
695         if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
696             ++index;
697             secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
698             return TRUE;
699         }
700         if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
701             // End of a range, enumerate the primaries in the range.
702             int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
703             p &= 0xffffff00;
704             if(pri == p) {
705                 // Finished the range, return the next CE after it.
706                 ++index;
707                 return next();
708             }
709             U_ASSERT(pri < p);
710             // Return the next primary in this range.
711             UBool isCompressible = data.isCompressiblePrimary(pri);
712             if((pri & 0xffff) == 0) {
713                 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
714             } else {
715                 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
716             }
717             return TRUE;
718         }
719         // Simple primary CE.
720         ++index;
721         pri = p;
722         // Does this have an explicit below-common sec/ter unit,
723         // or does it imply a common one?
724         if(index == length) {
725             secTer = Collation::COMMON_SEC_AND_TER_CE;
726         } else {
727             secTer = elements[index];
728             if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
729                 // No sec/ter delta.
730                 secTer = Collation::COMMON_SEC_AND_TER_CE;
731             } else {
732                 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
733                 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
734                     // Implied sec/ter.
735                     secTer = Collation::COMMON_SEC_AND_TER_CE;
736                 } else {
737                     // Explicit sec/ter below common/common.
738                     ++index;
739                 }
740             }
741         }
742         return TRUE;
743     }
744 
    /** Returns the primary weight stored by the most recent next() call. */
    uint32_t getPrimary() const { return pri; }
    /** Returns the combined secondary/tertiary value stored by the most recent next() call. */
    uint32_t getSecTer() const { return secTer; }
747 
748 private:
749     const CollationData &data;
750     const uint32_t *elements;
751     int32_t length;
752 
753     uint32_t pri;
754     uint32_t secTer;
755     int32_t index;
756 };
757 
758 }  // namespace
759 
TestRootElements()760 void CollationTest::TestRootElements() {
761     IcuTestErrorCode errorCode(*this, "TestRootElements");
762     const CollationData *root = CollationRoot::getData(errorCode);
763     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
764         return;
765     }
766     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
767     RootElementsIterator iter(*root);
768 
769     // We check each root CE for validity,
770     // and we also verify that there is a tailoring gap between each two CEs.
771     CollationWeights cw1c;  // compressible primary weights
772     CollationWeights cw1u;  // uncompressible primary weights
773     CollationWeights cw2;
774     CollationWeights cw3;
775 
776     cw1c.initForPrimary(TRUE);
777     cw1u.initForPrimary(FALSE);
778     cw2.initForSecondary();
779     cw3.initForTertiary();
780 
781     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
782     // nor the special merge-separator CE for U+FFFE.
783     uint32_t prevPri = 0;
784     uint32_t prevSec = 0;
785     uint32_t prevTer = 0;
786     while(iter.next()) {
787         uint32_t pri = iter.getPrimary();
788         uint32_t secTer = iter.getSecTer();
789         // CollationRootElements CEs must have 0 case and quaternary bits.
790         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
791             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
792                   (long)pri, (long)secTer);
793         }
794         uint32_t sec = secTer >> 16;
795         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
796         uint32_t ctq = ter;
797         if(pri == 0 && sec == 0 && ter != 0) {
798             // Tertiary CEs must have uppercase bits,
799             // but they are not stored in the CollationRootElements.
800             ctq |= 0x8000;
801         }
802         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
803             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
804         } else {
805             if(pri != prevPri) {
806                 uint32_t newWeight = 0;
807                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
808                     // There is currently no tailoring gap after primary ignorables,
809                     // and we forbid tailoring after U+FFFD and U+FFFF.
810                 } else if(root->isCompressiblePrimary(prevPri)) {
811                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
812                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
813                               (long)prevPri, (long)pri);
814                     } else {
815                         newWeight = cw1c.nextWeight();
816                     }
817                 } else {
818                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
819                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
820                               (long)prevPri, (long)pri);
821                     } else {
822                         newWeight = cw1u.nextWeight();
823                     }
824                 }
825                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
826                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
827                           (long)prevPri, (long)newWeight, (long)pri);
828                 }
829             } else if(sec != prevSec) {
830                 uint32_t lowerLimit =
831                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
832                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
833                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
834                 } else {
835                     uint32_t newWeight = cw2.nextWeight();
836                     if(!(prevSec < newWeight && newWeight < sec)) {
837                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
838                               (long)lowerLimit, (long)newWeight, (long)sec);
839                     }
840                 }
841             } else if(ter != prevTer) {
842                 uint32_t lowerLimit =
843                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
844                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
845                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
846                 } else {
847                     uint32_t newWeight = cw3.nextWeight();
848                     if(!(prevTer < newWeight && newWeight < ter)) {
849                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
850                               (long)lowerLimit, (long)newWeight, (long)ter);
851                     }
852                 }
853             } else {
854                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
855             }
856         }
857         prevPri = pri;
858         prevSec = sec;
859         prevTer = ter;
860     }
861 }
862 
TestTailoredElements()863 void CollationTest::TestTailoredElements() {
864     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
865     const CollationData *root = CollationRoot::getData(errorCode);
866     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
867         return;
868     }
869     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
870 
871     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
872     if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
873         return;
874     }
875     uhash_setKeyDeleter(prevLocales, uprv_free);
876     // TestRootElements() tests the root collator which does not have tailorings.
877     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
878     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
879     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
880 
881     UVector64 ces(errorCode);
882     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
883     U_ASSERT(locales.isValid());
884     const char *localeID = "root";
885     do {
886         Locale locale(localeID);
887         LocalPointer<StringEnumeration> types(
888                 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
889         errorCode.assertSuccess();
890         const char *type;  // first: default type
891         while((type = types->next(NULL, errorCode)) != NULL) {
892             if(strncmp(type, "private-", 8) == 0) {
893                 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
894                         localeID, type);
895             }
896             Locale localeWithType(locale);
897             localeWithType.setKeywordValue("collation", type, errorCode);
898             errorCode.assertSuccess();
899             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
900             if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
901                                               localeWithType.getName())) {
902                 continue;
903             }
904             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
905             if(uhash_geti(prevLocales, actual.getName()) != 0) {
906                 continue;
907             }
908             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
909             errorCode.assertSuccess();
910             logln("TestTailoredElements(): requested %s -> actual %s",
911                   localeWithType.getName(), actual.getName());
912             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
913             if(rbc == NULL) {
914                 continue;
915             }
916             // Note: It would be better to get tailored strings such that we can
917             // identify the prefix, and only get the CEs for the prefix+string,
918             // not also for the prefix.
919             // There is currently no API for that.
920             // It would help in an unusual case where a contraction starting in the prefix
921             // extends past its end, and we do not see the intended mapping.
922             // For example, for a mapping p|st, if there is also a contraction ps,
923             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
924             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
925             errorCode.assertSuccess();
926             UnicodeSetIterator iter(*tailored);
927             while(iter.next()) {
928                 const UnicodeString &s = iter.getString();
929                 ces.removeAllElements();
930                 rbc->internalGetCEs(s, ces, errorCode);
931                 errorCode.assertSuccess();
932                 for(int32_t i = 0; i < ces.size(); ++i) {
933                     int64_t ce = ces.elementAti(i);
934                     if(!isValidCE(rootElements, *root, ce)) {
935                         errln("invalid tailored CE %016llx at CE index %d from string:",
936                               (long long)ce, (int)i);
937                         infoln(prettify(s));
938                     }
939                 }
940             }
941         }
942     } while((localeID = locales->next(NULL, errorCode)) != NULL);
943     uhash_close(prevLocales);
944 }
945 
printSortKey(const uint8_t * p,int32_t length)946 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
947     UnicodeString s;
948     for(int32_t i = 0; i < length; ++i) {
949         if(i > 0) { s.append((UChar)0x20); }
950         uint8_t b = p[i];
951         if(b == 0) {
952             s.append((UChar)0x2e);  // period
953         } else if(b == 1) {
954             s.append((UChar)0x7c);  // vertical bar
955         } else {
956             appendHex(b, 2, s);
957         }
958     }
959     return s;
960 }
961 
printCollationKey(const CollationKey & key)962 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
963     int32_t length;
964     const uint8_t *p = key.getByteArray(length);
965     return printSortKey(p, length);
966 }
967 
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)968 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
969     for(;;) {
970         int32_t lineLength;
971         const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
972         if(line == NULL || errorCode.isFailure()) {
973             fileLine.remove();
974             return FALSE;
975         }
976         ++fileLineNumber;
977         // Strip trailing CR/LF, comments, and spaces.
978         const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
979         if(comment != NULL) {
980             lineLength = (int32_t)(comment - line);
981         } else {
982             while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
983         }
984         while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
985         if(lineLength != 0) {
986             fileLine.setTo(FALSE, line, lineLength);
987             return TRUE;
988         }
989         // Empty line, continue.
990     }
991 }
992 
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)993 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
994                                 UErrorCode &errorCode) {
995     int32_t length = fileLine.length();
996     int32_t i;
997     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
998     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
999     if(pipeIndex >= 0) {
1000         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1001         if(prefix.isEmpty()) {
1002             errln("empty prefix on line %d", (int)fileLineNumber);
1003             infoln(fileLine);
1004             errorCode = U_PARSE_ERROR;
1005             return;
1006         }
1007         start = pipeIndex + 1;
1008     } else {
1009         prefix.remove();
1010     }
1011     s = fileLine.tempSubStringBetween(start, i).unescape();
1012     if(s.isEmpty()) {
1013         errln("empty string on line %d", (int)fileLineNumber);
1014         infoln(fileLine);
1015         errorCode = U_PARSE_ERROR;
1016         return;
1017     }
1018     start = i;
1019 }
1020 
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1021 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1022     Collation::Level relation;
1023     int32_t start;
1024     if(fileLine[0] == 0x3c) {  // <
1025         UChar second = fileLine[1];
1026         start = 2;
1027         switch(second) {
1028         case 0x31:  // <1
1029             relation = Collation::PRIMARY_LEVEL;
1030             break;
1031         case 0x32:  // <2
1032             relation = Collation::SECONDARY_LEVEL;
1033             break;
1034         case 0x33:  // <3
1035             relation = Collation::TERTIARY_LEVEL;
1036             break;
1037         case 0x34:  // <4
1038             relation = Collation::QUATERNARY_LEVEL;
1039             break;
1040         case 0x63:  // <c
1041             relation = Collation::CASE_LEVEL;
1042             break;
1043         case 0x69:  // <i
1044             relation = Collation::IDENTICAL_LEVEL;
1045             break;
1046         default:  // just <
1047             relation = Collation::NO_LEVEL;
1048             start = 1;
1049             break;
1050         }
1051     } else if(fileLine[0] == 0x3d) {  // =
1052         relation = Collation::ZERO_LEVEL;
1053         start = 1;
1054     } else {
1055         start = 0;
1056     }
1057     if(start == 0 || !isSpace(fileLine[start])) {
1058         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1059         infoln(fileLine);
1060         errorCode.set(U_PARSE_ERROR);
1061         return Collation::NO_LEVEL;
1062     }
1063     start = skipSpaces(start);
1064     UnicodeString prefix;
1065     parseString(start, prefix, s, errorCode);
1066     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1067         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1068         infoln(fileLine);
1069         errorCode.set(U_PARSE_ERROR);
1070         return Collation::NO_LEVEL;
1071     }
1072     if(start < fileLine.length()) {
1073         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1074         infoln(fileLine);
1075         errorCode.set(U_PARSE_ERROR);
1076         return Collation::NO_LEVEL;
1077     }
1078     return relation;
1079 }
1080 
// Maps the attribute names accepted in the test data file to UColAttribute constants.
// parseAndSetAttribute() searches this table linearly.
static const struct {
    const char *name;    // attribute name as written in the test file
    UColAttribute attr;  // corresponding collator attribute
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1094 
// Maps the attribute value names accepted in the test data file
// to UColAttributeValue constants.
// parseAndSetAttribute() searches this table linearly; whether a value is legal
// for a given attribute is decided later by Collator::setAttribute().
static const struct {
    const char *name;          // value name as written in the test file
    UColAttributeValue value;  // corresponding attribute value
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1112 
parseAndSetAttribute(IcuTestErrorCode & errorCode)1113 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1114     // Parse attributes even if the Collator could not be created,
1115     // in order to report syntax errors.
1116     int32_t start = skipSpaces(1);
1117     int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1118     if(equalPos < 0) {
1119         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1120             parseAndSetReorderCodes(start + 7, errorCode);
1121             return;
1122         }
1123         errln("missing '=' on line %d", (int)fileLineNumber);
1124         infoln(fileLine);
1125         errorCode.set(U_PARSE_ERROR);
1126         return;
1127     }
1128 
1129     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1130     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1131     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1132         UColReorderCode max;
1133         if(valueString == UNICODE_STRING("space", 5)) {
1134             max = UCOL_REORDER_CODE_SPACE;
1135         } else if(valueString == UNICODE_STRING("punct", 5)) {
1136             max = UCOL_REORDER_CODE_PUNCTUATION;
1137         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1138             max = UCOL_REORDER_CODE_SYMBOL;
1139         } else if(valueString == UNICODE_STRING("currency", 8)) {
1140             max = UCOL_REORDER_CODE_CURRENCY;
1141         } else {
1142             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1143             infoln(fileLine);
1144             errorCode.set(U_PARSE_ERROR);
1145             return;
1146         }
1147         if(coll != NULL) {
1148             coll->setMaxVariable(max, errorCode);
1149             if(errorCode.isFailure()) {
1150                 errln("setMaxVariable() failed on line %d: %s",
1151                       (int)fileLineNumber, errorCode.errorName());
1152                 infoln(fileLine);
1153                 return;
1154             }
1155         }
1156         fileLine.remove();
1157         return;
1158     }
1159 
1160     UColAttribute attr;
1161     for(int32_t i = 0;; ++i) {
1162         if(i == UPRV_LENGTHOF(attributes)) {
1163             errln("invalid attribute name on line %d", (int)fileLineNumber);
1164             infoln(fileLine);
1165             errorCode.set(U_PARSE_ERROR);
1166             return;
1167         }
1168         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1169             attr = attributes[i].attr;
1170             break;
1171         }
1172     }
1173 
1174     UColAttributeValue value;
1175     for(int32_t i = 0;; ++i) {
1176         if(i == UPRV_LENGTHOF(attributeValues)) {
1177             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1178             infoln(fileLine);
1179             errorCode.set(U_PARSE_ERROR);
1180             return;
1181         }
1182         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1183             value = attributeValues[i].value;
1184             break;
1185         }
1186     }
1187 
1188     if(coll != NULL) {
1189         coll->setAttribute(attr, value, errorCode);
1190         if(errorCode.isFailure()) {
1191             errln("illegal attribute=value combination on line %d: %s",
1192                   (int)fileLineNumber, errorCode.errorName());
1193             infoln(fileLine);
1194             return;
1195         }
1196     }
1197     fileLine.remove();
1198 }
1199 
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1200 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1201     UVector32 reorderCodes(errorCode);
1202     while(start < fileLine.length()) {
1203         start = skipSpaces(start);
1204         int32_t limit = start;
1205         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1206         CharString name;
1207         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1208         int32_t code = CollationRuleParser::getReorderCode(name.data());
1209         if(code < 0) {
1210             if(uprv_stricmp(name.data(), "default") == 0) {
1211                 code = UCOL_REORDER_CODE_DEFAULT;  // -1
1212             } else {
1213                 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1214                 infoln(fileLine);
1215                 errorCode.set(U_PARSE_ERROR);
1216                 return;
1217             }
1218         }
1219         reorderCodes.addElement(code, errorCode);
1220         start = limit;
1221     }
1222     if(coll != NULL) {
1223         coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1224         if(errorCode.isFailure()) {
1225             errln("setReorderCodes() failed on line %d: %s",
1226                   (int)fileLineNumber, errorCode.errorName());
1227             infoln(fileLine);
1228             return;
1229         }
1230     }
1231     fileLine.remove();
1232 }
1233 
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1234 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1235     UnicodeString rules;
1236     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1237         rules.append(fileLine.unescape());
1238     }
1239     if(errorCode.isFailure()) { return; }
1240     logln(rules);
1241 
1242     UParseError parseError;
1243     UnicodeString reason;
1244     delete coll;
1245     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1246     if(coll == NULL) {
1247         errln("unable to allocate a new collator");
1248         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1249         return;
1250     }
1251     if(errorCode.isFailure()) {
1252         dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1253         infoln(UnicodeString("  reason: ") + reason);
1254         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1255         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1256             infoln(UnicodeString("  snippet: ...") +
1257                 parseError.preContext + "(!)" + parseError.postContext + "...");
1258         }
1259         delete coll;
1260         coll = NULL;
1261         errorCode.reset();
1262     } else {
1263         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1264                      UnicodeString(), reason);
1265     }
1266 }
1267 
setRootCollator(IcuTestErrorCode & errorCode)1268 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1269     if(errorCode.isFailure()) { return; }
1270     delete coll;
1271     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1272     if(errorCode.isFailure()) {
1273         dataerrln("unable to create a root collator");
1274         return;
1275     }
1276 }
1277 
setLocaleCollator(IcuTestErrorCode & errorCode)1278 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1279     if(errorCode.isFailure()) { return; }
1280     delete coll;
1281     coll = NULL;
1282     int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1283     if(at >= 0) {
1284         fileLine.setCharAt(at, (UChar)0x2a);  // *
1285     }
1286     CharString localeID;
1287     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1288     if(at >= 0) {
1289         localeID.data()[at - 9] = '@';
1290     }
1291     Locale locale(localeID.data());
1292     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1293         errln("invalid language tag on line %d", (int)fileLineNumber);
1294         infoln(fileLine);
1295         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1296         return;
1297     }
1298 
1299     logln("creating a collator for locale ID %s", locale.getName());
1300     coll = Collator::createInstance(locale, errorCode);
1301     if(errorCode.isFailure()) {
1302         dataerrln("unable to create a collator for locale %s on line %d",
1303                   locale.getName(), (int)fileLineNumber);
1304         infoln(fileLine);
1305         delete coll;
1306         coll = NULL;
1307         errorCode.reset();
1308     }
1309 }
1310 
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1311 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1312     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1313     // In some sequences with Tibetan composite vowel signs,
1314     // even if the string passes the FCD check,
1315     // those composites must be decomposed.
1316     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1317     int32_t index = 0;
1318     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1319         if(++index < s.length()) {
1320             UChar c = s[index];
1321             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1322         }
1323     }
1324     return FALSE;
1325 }
1326 
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1327 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1328                                      CharString &dest, int32_t partSize,
1329                                      IcuTestErrorCode &errorCode) {
1330     if(errorCode.isFailure()) { return FALSE; }
1331     uint8_t part[32];
1332     U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1333     UCharIterator iter;
1334     uiter_setString(&iter, s, length);
1335     uint32_t state[2] = { 0, 0 };
1336     for(;;) {
1337         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1338         UBool done = partLength < partSize;
1339         if(done) {
1340             // At the end, append the next byte as well which should be 00.
1341             ++partLength;
1342         }
1343         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1344         if(done) {
1345             return errorCode.isSuccess();
1346         }
1347     }
1348 }
1349 
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1350 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1351                                      const UChar *s, int32_t length,
1352                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1353     if(errorCode.isFailure()) { return FALSE; }
1354     coll->getCollationKey(s, length, key, errorCode);
1355     if(errorCode.isFailure()) {
1356         infoln(fileTestName);
1357         errln("Collator(%s).getCollationKey() failed: %s",
1358               norm, errorCode.errorName());
1359         infoln(line);
1360         return FALSE;
1361     }
1362     int32_t keyLength;
1363     const uint8_t *keyBytes = key.getByteArray(keyLength);
1364     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1365         infoln(fileTestName);
1366         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1367               norm);
1368         infoln(line);
1369         infoln(printCollationKey(key));
1370         return FALSE;
1371     }
1372 
1373     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1374     if(numLevels < UCOL_IDENTICAL) {
1375         ++numLevels;
1376     } else {
1377         numLevels = 5;
1378     }
1379     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1380         ++numLevels;
1381     }
1382     errorCode.assertSuccess();
1383     int32_t numLevelSeparators = 0;
1384     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1385         uint8_t b = keyBytes[i];
1386         if(b == 0) {
1387             infoln(fileTestName);
1388             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1389             infoln(line);
1390             infoln(printCollationKey(key));
1391             return FALSE;
1392         }
1393         if(b == 1) { ++numLevelSeparators; }
1394     }
1395     if(numLevelSeparators != (numLevels - 1)) {
1396         infoln(fileTestName);
1397         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1398               norm, (int)numLevelSeparators, (int)numLevels);
1399         infoln(line);
1400         infoln(printCollationKey(key));
1401         return FALSE;
1402     }
1403 
1404     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1405     static const int32_t partSizes[] = { 32, 3, 1 };
1406     for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1407         int32_t partSize = partSizes[psi];
1408         CharString parts;
1409         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1410             infoln(fileTestName);
1411             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1412                   norm, (int)partSize, errorCode.errorName());
1413             infoln(line);
1414             return FALSE;
1415         }
1416         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1417             infoln(fileTestName);
1418             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1419                   norm, (int)partSize);
1420             infoln(line);
1421             infoln(printCollationKey(key));
1422             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1423             return FALSE;
1424         }
1425     }
1426     return TRUE;
1427 }
1428 
/**
 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
 * Leaves key unchanged if s does not contain U+FFFE.
 *
 * Each segment's sort key is computed with coll->getCollationKey() and
 * folded into the accumulated key with ucol_mergeSortkeys().
 *
 * @param s the input string
 * @param length the length of s, or negative if s is NUL-terminated
 * @param key receives the merged key
 * @param errorCode nothing is done if it already indicates a failure
 * @return TRUE if the key was successfully changed
 */
UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
                                           CollationKey &key, IcuTestErrorCode &errorCode) {
    if(errorCode.isFailure()) { return FALSE; }
    // Accumulated result of merging the segment keys seen so far.
    LocalMemory<uint8_t> mergedKey;
    int32_t mergedKeyLength = 0;
    int32_t mergedKeyCapacity = 0;
    int32_t sLength = (length >= 0) ? length : u_strlen(s);
    int32_t segmentStart = 0;
    for(int32_t i = 0;;) {
        if(i == sLength) {
            // segmentStart is advanced past each U+FFFE,
            // so it is still 0 here only if none was found.
            if(segmentStart == 0) {
                // s does not contain any U+FFFE.
                return FALSE;
            }
        } else if(s[i] != 0xfffe) {
            ++i;
            continue;
        }
        // Here i is either at a U+FFFE or at the end of s, which ends the last segment.
        // Get the sort key for another segment and merge it into mergedKey.
        // key1 must be created before mergedKey may be reallocated below.
        CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
        CollationKey key2;
        // NOTE(review): errorCode is not checked after this call -- confirm that callers do.
        coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
        int32_t key1Length, key2Length;
        const uint8_t *key1Bytes = key1.getByteArray(key1Length);
        const uint8_t *key2Bytes = key2.getByteArray(key2Length);
        uint8_t *dest;
        int32_t minCapacity = key1Length + key2Length;
        // Presumably the merged key needs only one terminator for the two keys -- TODO confirm.
        if(key1Length > 0) { --minCapacity; }
        if(minCapacity <= mergedKeyCapacity) {
            dest = mergedKey.getAlias();
        } else {
            // Grow the buffer: at least 200 bytes, then doubling, or the exact size if larger.
            if(minCapacity <= 200) {
                mergedKeyCapacity = 200;
            } else if(minCapacity <= 2 * mergedKeyCapacity) {
                mergedKeyCapacity *= 2;
            } else {
                mergedKeyCapacity = minCapacity;
            }
            dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
        }
        U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
        if(key1Length == 0) {
            // key2 is the sort key for the first segment.
            uprv_memcpy(dest, key2Bytes, key2Length);
            mergedKeyLength = key2Length;
        } else {
            mergedKeyLength =
                ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
                                   dest, mergedKeyCapacity);
        }
        if(i == sLength) { break; }
        // Skip the U+FFFE and start the next segment after it.
        segmentStart = ++i;
    }
    key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
    return TRUE;
}
1490 
1491 namespace {
1492 
1493 /**
1494  * Replaces unpaired surrogates with U+FFFD.
1495  * Returns s if no replacement was made, otherwise buffer.
1496  */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1497 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1498     int32_t i = 0;
1499     while(i < s.length()) {
1500         UChar32 c = s.char32At(i);
1501         if(U_IS_SURROGATE(c)) {
1502             if(buffer.length() < i) {
1503                 buffer.append(s, buffer.length(), i - buffer.length());
1504             }
1505             buffer.append((UChar)0xfffd);
1506         }
1507         i += U16_LENGTH(c);
1508     }
1509     if(buffer.isEmpty()) {
1510         return s;
1511     }
1512     if(buffer.length() < i) {
1513         buffer.append(s, buffer.length(), i - buffer.length());
1514     }
1515     return buffer;
1516 }
1517 
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1518 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1519                            UCollationResult order, UBool collHasCaseLevel) {
1520     if(order == UCOL_EQUAL) {
1521         return Collation::NO_LEVEL;
1522     }
1523     int32_t prevKeyLength;
1524     const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1525     int32_t keyLength;
1526     const uint8_t *bytes = key.getByteArray(keyLength);
1527     int32_t level = Collation::PRIMARY_LEVEL;
1528     for(int32_t i = 0;; ++i) {
1529         uint8_t b = prevBytes[i];
1530         if(b != bytes[i]) { break; }
1531         if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1532             ++level;
1533             if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1534                 ++level;
1535             }
1536         }
1537     }
1538     return level;
1539 }
1540 
1541 }
1542 
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1543 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1544                                      const UnicodeString &prevString, const UnicodeString &s,
1545                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1546                                      IcuTestErrorCode &errorCode) {
1547     if(errorCode.isFailure()) { return FALSE; }
1548 
1549     // Get the sort keys first, for error debug output.
1550     CollationKey prevKey;
1551     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1552                         prevKey, errorCode)) {
1553         return FALSE;
1554     }
1555     CollationKey key;
1556     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1557 
1558     UCollationResult order = coll->compare(prevString, s, errorCode);
1559     if(order != expectedOrder || errorCode.isFailure()) {
1560         infoln(fileTestName);
1561         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1562               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1563         infoln(prevFileLine);
1564         infoln(fileLine);
1565         infoln(printCollationKey(prevKey));
1566         infoln(printCollationKey(key));
1567         return FALSE;
1568     }
1569     order = coll->compare(s, prevString, errorCode);
1570     if(order != -expectedOrder || errorCode.isFailure()) {
1571         infoln(fileTestName);
1572         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1573               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1574         infoln(prevFileLine);
1575         infoln(fileLine);
1576         infoln(printCollationKey(prevKey));
1577         infoln(printCollationKey(key));
1578         return FALSE;
1579     }
1580     // Test NUL-termination if the strings do not contain NUL characters.
1581     UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1582     if(!containNUL) {
1583         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1584         if(order != expectedOrder || errorCode.isFailure()) {
1585             infoln(fileTestName);
1586             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1587                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1588             infoln(prevFileLine);
1589             infoln(fileLine);
1590             infoln(printCollationKey(prevKey));
1591             infoln(printCollationKey(key));
1592             return FALSE;
1593         }
1594         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1595         if(order != -expectedOrder || errorCode.isFailure()) {
1596             infoln(fileTestName);
1597             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1598                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1599             infoln(prevFileLine);
1600             infoln(fileLine);
1601             infoln(printCollationKey(prevKey));
1602             infoln(printCollationKey(key));
1603             return FALSE;
1604         }
1605     }
1606 
1607     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1608     // Unpaired surrogates cannot be converted to UTF-8.
1609     // Create valid UTF-16 strings if necessary, and use those for
1610     // both the expected compare() result and for the input to compare(UTF-8).
1611     UnicodeString prevBuffer, sBuffer;
1612     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1613     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1614     std::string prevUTF8, sUTF8;
1615     UnicodeString(prevValid).toUTF8String(prevUTF8);
1616     UnicodeString(sValid).toUTF8String(sUTF8);
1617     UCollationResult expectedUTF8Order;
1618     if(&prevValid == &prevString && &sValid == &s) {
1619         expectedUTF8Order = expectedOrder;
1620     } else {
1621         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1622     }
1623 
1624     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1625     if(order != expectedUTF8Order || errorCode.isFailure()) {
1626         infoln(fileTestName);
1627         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1628               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1629         infoln(prevFileLine);
1630         infoln(fileLine);
1631         infoln(printCollationKey(prevKey));
1632         infoln(printCollationKey(key));
1633         return FALSE;
1634     }
1635     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1636     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1637         infoln(fileTestName);
1638         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1639               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1640         infoln(prevFileLine);
1641         infoln(fileLine);
1642         infoln(printCollationKey(prevKey));
1643         infoln(printCollationKey(key));
1644         return FALSE;
1645     }
1646     // Test NUL-termination if the strings do not contain NUL characters.
1647     if(!containNUL) {
1648         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1649         if(order != expectedUTF8Order || errorCode.isFailure()) {
1650             infoln(fileTestName);
1651             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1652                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1653             infoln(prevFileLine);
1654             infoln(fileLine);
1655             infoln(printCollationKey(prevKey));
1656             infoln(printCollationKey(key));
1657             return FALSE;
1658         }
1659         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1660         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1661             infoln(fileTestName);
1662             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1663                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1664             infoln(prevFileLine);
1665             infoln(fileLine);
1666             infoln(printCollationKey(prevKey));
1667             infoln(printCollationKey(key));
1668             return FALSE;
1669         }
1670     }
1671 
1672     UCharIterator leftIter;
1673     UCharIterator rightIter;
1674     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1675     uiter_setString(&rightIter, s.getBuffer(), s.length());
1676     order = coll->compare(leftIter, rightIter, errorCode);
1677     if(order != expectedOrder || errorCode.isFailure()) {
1678         infoln(fileTestName);
1679         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1680               "wrong order: %d != %d (%s)",
1681               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1682         infoln(prevFileLine);
1683         infoln(fileLine);
1684         infoln(printCollationKey(prevKey));
1685         infoln(printCollationKey(key));
1686         return FALSE;
1687     }
1688 
1689     order = prevKey.compareTo(key, errorCode);
1690     if(order != expectedOrder || errorCode.isFailure()) {
1691         infoln(fileTestName);
1692         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1693               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1694         infoln(prevFileLine);
1695         infoln(fileLine);
1696         infoln(printCollationKey(prevKey));
1697         infoln(printCollationKey(key));
1698         return FALSE;
1699     }
1700     UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1701     int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1702     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1703         if(level != expectedLevel) {
1704             infoln(fileTestName);
1705             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1706                   (int)fileLineNumber, norm, order, level, expectedLevel);
1707             infoln(prevFileLine);
1708             infoln(fileLine);
1709             infoln(printCollationKey(prevKey));
1710             infoln(printCollationKey(key));
1711             return FALSE;
1712         }
1713     }
1714 
1715     // If either string contains U+FFFE, then their sort keys must compare the same as
1716     // the merged sort keys of each string's between-FFFE segments.
1717     //
1718     // It is not required that
1719     //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1720     // only that those two methods yield the same order.
1721     //
1722     // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1723     if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1724                 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1725             errorCode.isFailure()) {
1726         order = prevKey.compareTo(key, errorCode);
1727         if(order != expectedOrder || errorCode.isFailure()) {
1728             infoln(fileTestName);
1729             errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1730                 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1731                 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1732             infoln(prevFileLine);
1733             infoln(fileLine);
1734             infoln(printCollationKey(prevKey));
1735             infoln(printCollationKey(key));
1736             return FALSE;
1737         }
1738         int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1739         if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1740             if(mergedLevel != level) {
1741                 infoln(fileTestName);
1742                 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1743                     "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1744                     (int)fileLineNumber, norm, order, mergedLevel, level);
1745                 infoln(prevFileLine);
1746                 infoln(fileLine);
1747                 infoln(printCollationKey(prevKey));
1748                 infoln(printCollationKey(key));
1749                 return FALSE;
1750             }
1751         }
1752     }
1753     return TRUE;
1754 }
1755 
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1756 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1757     if(errorCode.isFailure()) { return; }
1758     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1759     UnicodeString prevString, s;
1760     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1761     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1762         // Parse the line even if it will be ignored (when we do not have a Collator)
1763         // in order to report syntax issues.
1764         Collation::Level relation = parseRelationAndString(s, errorCode);
1765         if(errorCode.isFailure()) {
1766             errorCode.reset();
1767             break;
1768         }
1769         if(coll == NULL) {
1770             // We were unable to create the Collator but continue with tests.
1771             // Ignore test data for this Collator.
1772             // The next Collator creation might work.
1773             continue;
1774         }
1775         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1776         Collation::Level expectedLevel = relation;
1777         s.getTerminatedBuffer();  // Ensure NUL-termination.
1778         UBool isOk = TRUE;
1779         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1780             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1781             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1782                                    expectedOrder, expectedLevel, errorCode);
1783         }
1784         if(isOk) {
1785             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1786             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1787                                    expectedOrder, expectedLevel, errorCode);
1788         }
1789         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1790             UnicodeString pn = nfd->normalize(prevString, errorCode);
1791             UnicodeString n = nfd->normalize(s, errorCode);
1792             pn.getTerminatedBuffer();
1793             n.getTerminatedBuffer();
1794             errorCode.assertSuccess();
1795             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1796                                    expectedOrder, expectedLevel, errorCode);
1797         }
1798         if(!isOk) {
1799             errorCode.reset();  // already reported
1800         }
1801         prevFileLine = fileLine;
1802         prevString = s;
1803         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1804     }
1805 }
1806 
TestDataDriven()1807 void CollationTest::TestDataDriven() {
1808     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1809 
1810     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1811     nfd = Normalizer2::getNFDInstance(errorCode);
1812     if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1813         return;
1814     }
1815 
1816     CharString path(getSourceTestData(errorCode), errorCode);
1817     path.appendPathPart("collationtest.txt", errorCode);
1818     const char *codePage = "UTF-8";
1819     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1820     if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1821         return;
1822     }
1823     // Read a new line if necessary.
1824     // Sub-parsers leave the first line set that they do not handle.
1825     while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1826         if(!isSectionStarter(fileLine[0])) {
1827             errln("syntax error on line %d", (int)fileLineNumber);
1828             infoln(fileLine);
1829             return;
1830         }
1831         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1832             fileTestName = fileLine;
1833             logln(fileLine);
1834             fileLine.remove();
1835         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1836             setRootCollator(errorCode);
1837             fileLine.remove();
1838         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1839             setLocaleCollator(errorCode);
1840             fileLine.remove();
1841         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1842             buildTailoring(f.getAlias(), errorCode);
1843         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1844             parseAndSetAttribute(errorCode);
1845         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1846             checkCompareStrings(f.getAlias(), errorCode);
1847         } else {
1848             errln("syntax error on line %d", (int)fileLineNumber);
1849             infoln(fileLine);
1850             return;
1851         }
1852     }
1853 }
1854 
1855 #endif  // !UCONFIG_NO_COLLATION
1856