• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/tblcoll.h"
26 #include "unicode/uiter.h"
27 #include "unicode/uniset.h"
28 #include "unicode/unistr.h"
29 #include "unicode/usetiter.h"
30 #include "unicode/ustring.h"
31 #include "charstr.h"
32 #include "cmemory.h"
33 #include "collation.h"
34 #include "collationdata.h"
35 #include "collationfcd.h"
36 #include "collationiterator.h"
37 #include "collationroot.h"
38 #include "collationrootelements.h"
39 #include "collationruleparser.h"
40 #include "collationweights.h"
41 #include "cstring.h"
42 #include "intltest.h"
43 #include "normalizer2impl.h"
44 #include "ucbuf.h"
45 #include "uhash.h"
46 #include "uitercollationiterator.h"
47 #include "utf16collationiterator.h"
48 #include "utf8collationiterator.h"
49 #include "uvectr32.h"
50 #include "uvectr64.h"
51 #include "writesrc.h"
52 
53 class CodePointIterator;
54 
55 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
56 
57 class CollationTest : public IntlTest {
58 public:
CollationTest()59     CollationTest()
60             : fcd(NULL), nfd(NULL),
61               fileLineNumber(0),
62               coll(NULL) {}
63 
~CollationTest()64     ~CollationTest() {
65         delete coll;
66     }
67 
68     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
69 
70     void TestMinMax();
71     void TestImplicits();
72     void TestNulTerminated();
73     void TestIllegalUTF8();
74     void TestShortFCDData();
75     void TestFCD();
76     void TestCollationWeights();
77     void TestRootElements();
78     void TestTailoredElements();
79     void TestDataDriven();
80 
81 private:
82     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
83     void checkAllocWeights(CollationWeights &cw,
84                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
85                            int32_t someLength, int32_t minCount);
86 
87     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
88     static UnicodeString printCollationKey(const CollationKey &key);
89 
90     // Helpers & fields for data-driven test.
isCROrLF(UChar c)91     static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)92     static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)93     static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
skipSpaces(int32_t i)94     int32_t skipSpaces(int32_t i) {
95         while(isSpace(fileLine[i])) { ++i; }
96         return i;
97     }
98 
99     UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
100     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
101     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
102     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
103     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
104     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
105     void setRootCollator(IcuTestErrorCode &errorCode);
106     void setLocaleCollator(IcuTestErrorCode &errorCode);
107 
108     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
109 
110     UBool getSortKeyParts(const UChar *s, int32_t length,
111                           CharString &dest, int32_t partSize,
112                           IcuTestErrorCode &errorCode);
113     UBool getCollationKey(const char *norm, const UnicodeString &line,
114                           const UChar *s, int32_t length,
115                           CollationKey &key, IcuTestErrorCode &errorCode);
116     UBool getMergedCollationKey(const UChar *s, int32_t length,
117                                 CollationKey &key, IcuTestErrorCode &errorCode);
118     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
119                           const UnicodeString &prevString, const UnicodeString &s,
120                           UCollationResult expectedOrder, Collation::Level expectedLevel,
121                           IcuTestErrorCode &errorCode);
122     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
123 
124     const Normalizer2 *fcd, *nfd;
125     UnicodeString fileLine;
126     int32_t fileLineNumber;
127     UnicodeString fileTestName;
128     Collator *coll;
129 };
130 
createCollationTest()131 extern IntlTest *createCollationTest() {
132     return new CollationTest();
133 }
134 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)135 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
136     if(exec) {
137         logln("TestSuite CollationTest: ");
138     }
139     TESTCASE_AUTO_BEGIN;
140     TESTCASE_AUTO(TestMinMax);
141     TESTCASE_AUTO(TestImplicits);
142     TESTCASE_AUTO(TestNulTerminated);
143     TESTCASE_AUTO(TestIllegalUTF8);
144     TESTCASE_AUTO(TestShortFCDData);
145     TESTCASE_AUTO(TestFCD);
146     TESTCASE_AUTO(TestCollationWeights);
147     TESTCASE_AUTO(TestRootElements);
148     TESTCASE_AUTO(TestTailoredElements);
149     TESTCASE_AUTO(TestDataDriven);
150     TESTCASE_AUTO_END;
151 }
152 
TestMinMax()153 void CollationTest::TestMinMax() {
154     IcuTestErrorCode errorCode(*this, "TestMinMax");
155 
156     setRootCollator(errorCode);
157     if(errorCode.isFailure()) {
158         errorCode.reset();
159         return;
160     }
161     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
162     if(rbc == NULL) {
163         errln("the root collator is not a RuleBasedCollator");
164         return;
165     }
166 
167     static const UChar s[2] = { 0xfffe, 0xffff };
168     UVector64 ces(errorCode);
169     rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
170     errorCode.assertSuccess();
171     if(ces.size() != 2) {
172         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
173         return;
174     }
175     int64_t ce = ces.elementAti(0);
176     int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
177     if(ce != expected) {
178         errln("CE(U+fffe)=%04lx != 02..", (long)ce);
179     }
180 
181     ce = ces.elementAti(1);
182     expected = Collation::makeCE(Collation::MAX_PRIMARY);
183     if(ce != expected) {
184         errln("CE(U+ffff)=%04lx != max..", (long)ce);
185     }
186 }
187 
TestImplicits()188 void CollationTest::TestImplicits() {
189     IcuTestErrorCode errorCode(*this, "TestImplicits");
190 
191     const CollationData *cd = CollationRoot::getData(errorCode);
192     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
193         return;
194     }
195 
196     // Implicit primary weights should be assigned for the following sets,
197     // and sort in ascending order by set and then code point.
198     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
199 
200     // core Han Unified Ideographs
201     UnicodeSet coreHan("[\\p{unified_ideograph}&"
202                             "[\\p{Block=CJK_Unified_Ideographs}"
203                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
204                        errorCode);
205     // all other Unified Han ideographs
206     UnicodeSet otherHan("[\\p{unified ideograph}-"
207                             "[\\p{Block=CJK_Unified_Ideographs}"
208                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
209                         errorCode);
210     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
211     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
212 
213     // Starting with CLDR 26/ICU 54, the root Han order may instead be
214     // the Unihan radical-stroke order.
215     // The tests should pass either way, so we only test the order of a small set of Han characters
216     // whose radical-stroke order is the same as their code point order.
217     UnicodeSet someHanInCPOrder(
218             "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
219             "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
220             errorCode);
221     UnicodeSet inOrder(someHanInCPOrder);
222     inOrder.addAll(unassigned).freeze();
223     if(errorCode.logIfFailureAndReset("UnicodeSet")) {
224         return;
225     }
226     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
227     UChar32 prev = 0;
228     uint32_t prevPrimary = 0;
229     UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
230     for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
231         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
232         while(iter->next()) {
233             UChar32 c = iter->getCodepoint();
234             UnicodeString s(c);
235             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
236             int64_t ce = ci.nextCE(errorCode);
237             int64_t ce2 = ci.nextCE(errorCode);
238             if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
239                 return;
240             }
241             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
242                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
243                 continue;
244             }
245             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
246                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
247                       (long)c, (long)(ce & 0xffffffff));
248                 continue;
249             }
250             uint32_t primary = (uint32_t)(ce >> 32);
251             if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
252                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
253                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
254             }
255             prev = c;
256             prevPrimary = primary;
257         }
258     }
259 }
260 
TestNulTerminated()261 void CollationTest::TestNulTerminated() {
262     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
263     const CollationData *data = CollationRoot::getData(errorCode);
264     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
265         return;
266     }
267 
268     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
269 
270     UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
271     UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
272     for(int32_t i = 0;; ++i) {
273         int64_t ce1 = ci1.nextCE(errorCode);
274         int64_t ce2 = ci2.nextCE(errorCode);
275         if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
276             return;
277         }
278         if(ce1 != ce2) {
279             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
280             break;
281         }
282         if(ce1 == Collation::NO_CE) { break; }
283     }
284 }
285 
TestIllegalUTF8()286 void CollationTest::TestIllegalUTF8() {
287     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
288 
289     setRootCollator(errorCode);
290     if(errorCode.isFailure()) {
291         errorCode.reset();
292         return;
293     }
294     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
295 
296     static const char *strings[] = {
297         // U+FFFD
298         "a\xef\xbf\xbdz",
299         // illegal byte sequences
300         "a\x80z",  // trail byte
301         "a\xc1\x81z",  // non-shortest form
302         "a\xe0\x82\x83z",  // non-shortest form
303         "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
304         "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
305         "a\xf0\x8f\xbf\xbfz",  // non-shortest form
306         "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
307     };
308 
309     StringPiece fffd(strings[0]);
310     for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
311         StringPiece illegal(strings[i]);
312         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
313         if(order != UCOL_EQUAL) {
314             errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
315                   (int)i, order);
316         }
317     }
318 }
319 
320 namespace {
321 
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)322 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
323     for(UChar32 c = 0x10000; c < 0x110000;) {
324         UChar32 next = c + 0x400;
325         if(src.containsSome(c, next - 1)) {
326             dest.add(U16_LEAD(c));
327         }
328         c = next;
329     }
330 }
331 
332 }  // namespace
333 
TestShortFCDData()334 void CollationTest::TestShortFCDData() {
335     // See CollationFCD class comments.
336     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
337     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
338     errorCode.assertSuccess();
339     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
340     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
341     UnicodeSet lccc;  // actual
342     for(UChar32 c = 0; c <= 0xffff; ++c) {
343         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
344     }
345     UnicodeSet diff(expectedLccc);
346     diff.removeAll(lccc);
347     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
348     UnicodeString empty("[]");
349     UnicodeString diffString;
350     diff.toPattern(diffString, TRUE);
351     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
352     diff = lccc;
353     diff.removeAll(expectedLccc);
354     diff.toPattern(diffString, TRUE);
355     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
356 
357     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
358     if (errorCode.isSuccess()) {
359         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
360         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
361         UnicodeSet tccc;  // actual
362         for(UChar32 c = 0; c <= 0xffff; ++c) {
363             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
364         }
365         diff = expectedTccc;
366         diff.removeAll(tccc);
367         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
368         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
369         diff = tccc;
370         diff.removeAll(expectedTccc);
371         diff.toPattern(diffString, TRUE);
372         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
373     }
374 }
375 
376 class CodePointIterator {
377 public:
CodePointIterator(const UChar32 * cp,int32_t length)378     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()379     void resetToStart() { pos = 0; }
next()380     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()381     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const382     int32_t getLength() const { return length; }
getIndex() const383     int getIndex() const { return (int)pos; }
384 private:
385     const UChar32 *cp;
386     int32_t length;
387     int32_t pos;
388 };
389 
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)390 void CollationTest::checkFCD(const char *name,
391                              CollationIterator &ci, CodePointIterator &cpi) {
392     IcuTestErrorCode errorCode(*this, "checkFCD");
393 
394     // Iterate forward to the limit.
395     for(;;) {
396         UChar32 c1 = ci.nextCodePoint(errorCode);
397         UChar32 c2 = cpi.next();
398         if(c1 != c2) {
399             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
400                   name, (long)c1, (long)c2, cpi.getIndex());
401             return;
402         }
403         if(c1 < 0) { break; }
404     }
405 
406     // Iterate backward most of the way.
407     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
408         UChar32 c1 = ci.previousCodePoint(errorCode);
409         UChar32 c2 = cpi.previous();
410         if(c1 != c2) {
411             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
412                   name, (long)c1, (long)c2, cpi.getIndex());
413             return;
414         }
415     }
416 
417     // Forward again.
418     for(;;) {
419         UChar32 c1 = ci.nextCodePoint(errorCode);
420         UChar32 c2 = cpi.next();
421         if(c1 != c2) {
422             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
423                   name, (long)c1, (long)c2, cpi.getIndex());
424             return;
425         }
426         if(c1 < 0) { break; }
427     }
428 
429     // Iterate backward to the start.
430     for(;;) {
431         UChar32 c1 = ci.previousCodePoint(errorCode);
432         UChar32 c2 = cpi.previous();
433         if(c1 != c2) {
434             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
435                   name, (long)c1, (long)c2, cpi.getIndex());
436             return;
437         }
438         if(c1 < 0) { break; }
439     }
440 }
441 
TestFCD()442 void CollationTest::TestFCD() {
443     IcuTestErrorCode errorCode(*this, "TestFCD");
444     const CollationData *data = CollationRoot::getData(errorCode);
445     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
446         return;
447     }
448 
449     // Input string, not FCD, NUL-terminated.
450     static const UChar s[] = {
451         0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
452         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
453         0x327, 0x308,  // ccc=202, 230
454         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
455         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
456         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
457         0xac01,
458         0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
459         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
460         0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
461         0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
462         0x4e00, 0xf81,
463         0
464     };
465     // Expected code points.
466     static const UChar32 cp[] = {
467         0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
468         0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
469         0x1D15F, 0x1D16D,
470         0xac01,
471         0x63, 0x327, 0x1D165, 0x1D16D,
472         0x61,
473         0xf71, 0xf71, 0xf72, 0xf74, 0x301,
474         0x4e00, 0xf71, 0xf80
475     };
476 
477     FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
478     if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
479         return;
480     }
481     CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
482     checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
483 
484 #if U_HAVE_STD_STRING
485     cpi.resetToStart();
486     std::string utf8;
487     UnicodeString(s).toUTF8String(utf8);
488     FCDUTF8CollationIterator u8ci(data, FALSE,
489                                   reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
490     if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
491         return;
492     }
493     checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
494 #endif
495 
496     cpi.resetToStart();
497     UCharIterator iter;
498     uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
499     FCDUIterCollationIterator uici(data, FALSE, iter, 0);
500     if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
501         return;
502     }
503     checkFCD("FCDUIterCollationIterator", uici, cpi);
504 }
505 
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)506 void CollationTest::checkAllocWeights(CollationWeights &cw,
507                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
508                                       int32_t someLength, int32_t minCount) {
509     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
510         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
511               (long)lowerLimit, (long)upperLimit, (long)n);
512         return;
513     }
514     uint32_t previous = lowerLimit;
515     int32_t count = 0;  // number of weights that have someLength
516     for(int32_t i = 0; i < n; ++i) {
517         uint32_t w = cw.nextWeight();
518         if(w == 0xffffffff) {
519             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
520                   "returns only %ld weights",
521                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
522             return;
523         }
524         if(!(previous < w && w < upperLimit)) {
525             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
526                   "number %ld -> %lx not between %lx and %lx",
527                   (long)lowerLimit, (long)upperLimit, (long)n,
528                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
529             return;
530         }
531         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
532     }
533     if(count < minCount) {
534         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
535               "returns only %ld < %ld weights of length %d",
536               (long)lowerLimit, (long)upperLimit, (long)n,
537               (long)count, (long)minCount, (int)someLength);
538     }
539 }
540 
TestCollationWeights()541 void CollationTest::TestCollationWeights() {
542     CollationWeights cw;
543 
544     // Non-compressible primaries use 254 second bytes 02..FF.
545     logln("CollationWeights.initForPrimary(non-compressible)");
546     cw.initForPrimary(FALSE);
547     // Expect 1 weight 11 and 254 weights 12xx.
548     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
549     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
550     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
551     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
552     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
553     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
554     // Expect 254^2=64516 three-byte weights.
555     // During computation, there should be 3 three-byte ranges
556     // 10ffff, 11xxxx, 120202.
557     // The middle one should be split 64515:1,
558     // and the newly-split-off range and the last ranged lengthened.
559     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
560     // Expect weights 1102 & 1103.
561     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
562     // Expect weights 102102 & 102103.
563     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
564 
565     // Compressible primaries use 251 second bytes 04..FE.
566     logln("CollationWeights.initForPrimary(compressible)");
567     cw.initForPrimary(TRUE);
568     // Expect 1 weight 11 and 251 weights 12xx.
569     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
570     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
571     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
572     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
573     // Expect weights 1104 & 1105.
574     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
575     // Expect weights 102102 & 102103.
576     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
577 
578     // Secondary and tertiary weights use only bytes 3 & 4.
579     logln("CollationWeights.initForSecondary()");
580     cw.initForSecondary();
581     // Expect weights fbxx and all four fc..ff.
582     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
583 
584     logln("CollationWeights.initForTertiary()");
585     cw.initForTertiary();
586     // Expect weights 3dxx and both 3e & 3f.
587     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
588 }
589 
590 namespace {
591 
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)592 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
593                 uint32_t p, uint32_t s, uint32_t ctq) {
594     uint32_t p1 = p >> 24;
595     uint32_t p2 = (p >> 16) & 0xff;
596     uint32_t p3 = (p >> 8) & 0xff;
597     uint32_t p4 = p & 0xff;
598     uint32_t s1 = s >> 8;
599     uint32_t s2 = s & 0xff;
600     // ctq = Case, Tertiary, Quaternary
601     uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
602     uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
603     uint32_t t1 = t >> 8;
604     uint32_t t2 = t & 0xff;
605     uint32_t q = ctq & Collation::QUATERNARY_MASK;
606     // No leading zero bytes.
607     if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
608         return FALSE;
609     }
610     // No intermediate zero bytes.
611     if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
612         return FALSE;
613     }
614     if(p2 != 0 && p3 == 0 && p4 != 0) {
615         return FALSE;
616     }
617     // Minimum & maximum lead bytes.
618     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
619             s1 == Collation::LEVEL_SEPARATOR_BYTE ||
620             t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
621         return FALSE;
622     }
623     if(c > 2) {
624         return FALSE;
625     }
626     // The valid byte range for the second primary byte depends on compressibility.
627     if(p2 != 0) {
628         if(data.isCompressibleLeadByte(p1)) {
629             if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
630                     Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
631                 return FALSE;
632             }
633         } else {
634             if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
635                 return FALSE;
636             }
637         }
638     }
639     // Other bytes just need to avoid the level separator.
640     // Trailing zeros are ok.
641     U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
642     if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
643             s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
644         return FALSE;
645     }
646     // Well-formed CEs.
647     if(p == 0) {
648         if(s == 0) {
649             if(t == 0) {
650                 // Completely ignorable CE.
651                 // Quaternary CEs are not supported.
652                 if(c != 0 || q != 0) {
653                     return FALSE;
654                 }
655             } else {
656                 // Tertiary CE.
657                 if(t < re.getTertiaryBoundary() || c != 2) {
658                     return FALSE;
659                 }
660             }
661         } else {
662             // Secondary CE.
663             if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
664                 return FALSE;
665             }
666         }
667     } else {
668         // Primary CE.
669         if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
670                 s >= re.getSecondaryBoundary()) {
671             return FALSE;
672         }
673         if(t == 0 || t >= re.getTertiaryBoundary()) {
674             return FALSE;
675         }
676     }
677     return TRUE;
678 }
679 
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)680 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
681     uint32_t p = (uint32_t)(ce >> 32);
682     uint32_t secTer = (uint32_t)ce;
683     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
684 }
685 
686 class RootElementsIterator {
687 public:
RootElementsIterator(const CollationData & root)688     RootElementsIterator(const CollationData &root)
689             : data(root),
690               elements(root.rootElements), length(root.rootElementsLength),
691               pri(0), secTer(0),
692               index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
693 
next()694     UBool next() {
695         if(index >= length) { return FALSE; }
696         uint32_t p = elements[index];
697         if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
698         if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
699             ++index;
700             secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
701             return TRUE;
702         }
703         if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
704             // End of a range, enumerate the primaries in the range.
705             int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
706             p &= 0xffffff00;
707             if(pri == p) {
708                 // Finished the range, return the next CE after it.
709                 ++index;
710                 return next();
711             }
712             U_ASSERT(pri < p);
713             // Return the next primary in this range.
714             UBool isCompressible = data.isCompressiblePrimary(pri);
715             if((pri & 0xffff) == 0) {
716                 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
717             } else {
718                 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
719             }
720             return TRUE;
721         }
722         // Simple primary CE.
723         ++index;
724         pri = p;
725         // Does this have an explicit below-common sec/ter unit,
726         // or does it imply a common one?
727         if(index == length) {
728             secTer = Collation::COMMON_SEC_AND_TER_CE;
729         } else {
730             secTer = elements[index];
731             if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
732                 // No sec/ter delta.
733                 secTer = Collation::COMMON_SEC_AND_TER_CE;
734             } else {
735                 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
736                 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
737                     // Implied sec/ter.
738                     secTer = Collation::COMMON_SEC_AND_TER_CE;
739                 } else {
740                     // Explicit sec/ter below common/common.
741                     ++index;
742                 }
743             }
744         }
745         return TRUE;
746     }
747 
getPrimary() const748     uint32_t getPrimary() const { return pri; }
getSecTer() const749     uint32_t getSecTer() const { return secTer; }
750 
751 private:
752     const CollationData &data;
753     const uint32_t *elements;
754     int32_t length;
755 
756     uint32_t pri;
757     uint32_t secTer;
758     int32_t index;
759 };
760 
761 }  // namespace
762 
TestRootElements()763 void CollationTest::TestRootElements() {
764     IcuTestErrorCode errorCode(*this, "TestRootElements");
765     const CollationData *root = CollationRoot::getData(errorCode);
766     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
767         return;
768     }
769     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
770     RootElementsIterator iter(*root);
771 
772     // We check each root CE for validity,
773     // and we also verify that there is a tailoring gap between each two CEs.
774     CollationWeights cw1c;  // compressible primary weights
775     CollationWeights cw1u;  // uncompressible primary weights
776     CollationWeights cw2;
777     CollationWeights cw3;
778 
779     cw1c.initForPrimary(TRUE);
780     cw1u.initForPrimary(FALSE);
781     cw2.initForSecondary();
782     cw3.initForTertiary();
783 
784     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
785     // nor the special merge-separator CE for U+FFFE.
786     uint32_t prevPri = 0;
787     uint32_t prevSec = 0;
788     uint32_t prevTer = 0;
789     while(iter.next()) {
790         uint32_t pri = iter.getPrimary();
791         uint32_t secTer = iter.getSecTer();
792         // CollationRootElements CEs must have 0 case and quaternary bits.
793         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
794             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
795                   (long)pri, (long)secTer);
796         }
797         uint32_t sec = secTer >> 16;
798         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
799         uint32_t ctq = ter;
800         if(pri == 0 && sec == 0 && ter != 0) {
801             // Tertiary CEs must have uppercase bits,
802             // but they are not stored in the CollationRootElements.
803             ctq |= 0x8000;
804         }
805         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
806             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
807         } else {
808             if(pri != prevPri) {
809                 uint32_t newWeight = 0;
810                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
811                     // There is currently no tailoring gap after primary ignorables,
812                     // and we forbid tailoring after U+FFFD and U+FFFF.
813                 } else if(root->isCompressiblePrimary(prevPri)) {
814                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
815                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
816                               (long)prevPri, (long)pri);
817                     } else {
818                         newWeight = cw1c.nextWeight();
819                     }
820                 } else {
821                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
822                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
823                               (long)prevPri, (long)pri);
824                     } else {
825                         newWeight = cw1u.nextWeight();
826                     }
827                 }
828                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
829                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
830                           (long)prevPri, (long)newWeight, (long)pri);
831                 }
832             } else if(sec != prevSec) {
833                 uint32_t lowerLimit =
834                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
835                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
836                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
837                 } else {
838                     uint32_t newWeight = cw2.nextWeight();
839                     if(!(prevSec < newWeight && newWeight < sec)) {
840                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
841                               (long)lowerLimit, (long)newWeight, (long)sec);
842                     }
843                 }
844             } else if(ter != prevTer) {
845                 uint32_t lowerLimit =
846                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
847                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
848                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
849                 } else {
850                     uint32_t newWeight = cw3.nextWeight();
851                     if(!(prevTer < newWeight && newWeight < ter)) {
852                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
853                               (long)lowerLimit, (long)newWeight, (long)ter);
854                     }
855                 }
856             } else {
857                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
858             }
859         }
860         prevPri = pri;
861         prevSec = sec;
862         prevTer = ter;
863     }
864 }
865 
TestTailoredElements()866 void CollationTest::TestTailoredElements() {
867     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
868     const CollationData *root = CollationRoot::getData(errorCode);
869     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
870         return;
871     }
872     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
873 
874     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
875     if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
876         return;
877     }
878     uhash_setKeyDeleter(prevLocales, uprv_free);
879     // TestRootElements() tests the root collator which does not have tailorings.
880     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
881     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
882     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
883 
884     UVector64 ces(errorCode);
885     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
886     U_ASSERT(locales.isValid());
887     const char *localeID = "root";
888     do {
889         Locale locale(localeID);
890         LocalPointer<StringEnumeration> types(
891                 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
892         errorCode.assertSuccess();
893         const char *type;  // first: default type
894         while((type = types->next(NULL, errorCode)) != NULL) {
895             if(strncmp(type, "private-", 8) == 0) {
896                 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
897                         localeID, type);
898             }
899             Locale localeWithType(locale);
900             localeWithType.setKeywordValue("collation", type, errorCode);
901             errorCode.assertSuccess();
902             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
903             if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
904                                               localeWithType.getName())) {
905                 continue;
906             }
907             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
908             if(uhash_geti(prevLocales, actual.getName()) != 0) {
909                 continue;
910             }
911             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
912             errorCode.assertSuccess();
913             logln("TestTailoredElements(): requested %s -> actual %s",
914                   localeWithType.getName(), actual.getName());
915             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
916             if(rbc == NULL) {
917                 continue;
918             }
919             // Note: It would be better to get tailored strings such that we can
920             // identify the prefix, and only get the CEs for the prefix+string,
921             // not also for the prefix.
922             // There is currently no API for that.
923             // It would help in an unusual case where a contraction starting in the prefix
924             // extends past its end, and we do not see the intended mapping.
925             // For example, for a mapping p|st, if there is also a contraction ps,
926             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
927             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
928             errorCode.assertSuccess();
929             UnicodeSetIterator iter(*tailored);
930             while(iter.next()) {
931                 const UnicodeString &s = iter.getString();
932                 ces.removeAllElements();
933                 rbc->internalGetCEs(s, ces, errorCode);
934                 errorCode.assertSuccess();
935                 for(int32_t i = 0; i < ces.size(); ++i) {
936                     int64_t ce = ces.elementAti(i);
937                     if(!isValidCE(rootElements, *root, ce)) {
938                         errln("invalid tailored CE %016llx at CE index %d from string:",
939                               (long long)ce, (int)i);
940                         infoln(prettify(s));
941                     }
942                 }
943             }
944         }
945     } while((localeID = locales->next(NULL, errorCode)) != NULL);
946     uhash_close(prevLocales);
947 }
948 
printSortKey(const uint8_t * p,int32_t length)949 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
950     UnicodeString s;
951     for(int32_t i = 0; i < length; ++i) {
952         if(i > 0) { s.append((UChar)0x20); }
953         uint8_t b = p[i];
954         if(b == 0) {
955             s.append((UChar)0x2e);  // period
956         } else if(b == 1) {
957             s.append((UChar)0x7c);  // vertical bar
958         } else {
959             appendHex(b, 2, s);
960         }
961     }
962     return s;
963 }
964 
printCollationKey(const CollationKey & key)965 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
966     int32_t length;
967     const uint8_t *p = key.getByteArray(length);
968     return printSortKey(p, length);
969 }
970 
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)971 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
972     for(;;) {
973         int32_t lineLength;
974         const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
975         if(line == NULL || errorCode.isFailure()) {
976             fileLine.remove();
977             return FALSE;
978         }
979         ++fileLineNumber;
980         // Strip trailing CR/LF, comments, and spaces.
981         const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
982         if(comment != NULL) {
983             lineLength = (int32_t)(comment - line);
984         } else {
985             while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
986         }
987         while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
988         if(lineLength != 0) {
989             fileLine.setTo(FALSE, line, lineLength);
990             return TRUE;
991         }
992         // Empty line, continue.
993     }
994 }
995 
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)996 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
997                                 UErrorCode &errorCode) {
998     int32_t length = fileLine.length();
999     int32_t i;
1000     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1001     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
1002     if(pipeIndex >= 0) {
1003         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1004         if(prefix.isEmpty()) {
1005             errln("empty prefix on line %d", (int)fileLineNumber);
1006             infoln(fileLine);
1007             errorCode = U_PARSE_ERROR;
1008             return;
1009         }
1010         start = pipeIndex + 1;
1011     } else {
1012         prefix.remove();
1013     }
1014     s = fileLine.tempSubStringBetween(start, i).unescape();
1015     if(s.isEmpty()) {
1016         errln("empty string on line %d", (int)fileLineNumber);
1017         infoln(fileLine);
1018         errorCode = U_PARSE_ERROR;
1019         return;
1020     }
1021     start = i;
1022 }
1023 
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1024 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1025     Collation::Level relation;
1026     int32_t start;
1027     if(fileLine[0] == 0x3c) {  // <
1028         UChar second = fileLine[1];
1029         start = 2;
1030         switch(second) {
1031         case 0x31:  // <1
1032             relation = Collation::PRIMARY_LEVEL;
1033             break;
1034         case 0x32:  // <2
1035             relation = Collation::SECONDARY_LEVEL;
1036             break;
1037         case 0x33:  // <3
1038             relation = Collation::TERTIARY_LEVEL;
1039             break;
1040         case 0x34:  // <4
1041             relation = Collation::QUATERNARY_LEVEL;
1042             break;
1043         case 0x63:  // <c
1044             relation = Collation::CASE_LEVEL;
1045             break;
1046         case 0x69:  // <i
1047             relation = Collation::IDENTICAL_LEVEL;
1048             break;
1049         default:  // just <
1050             relation = Collation::NO_LEVEL;
1051             start = 1;
1052             break;
1053         }
1054     } else if(fileLine[0] == 0x3d) {  // =
1055         relation = Collation::ZERO_LEVEL;
1056         start = 1;
1057     } else {
1058         start = 0;
1059     }
1060     if(start == 0 || !isSpace(fileLine[start])) {
1061         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1062         infoln(fileLine);
1063         errorCode.set(U_PARSE_ERROR);
1064         return Collation::NO_LEVEL;
1065     }
1066     start = skipSpaces(start);
1067     UnicodeString prefix;
1068     parseString(start, prefix, s, errorCode);
1069     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1070         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1071         infoln(fileLine);
1072         errorCode.set(U_PARSE_ERROR);
1073         return Collation::NO_LEVEL;
1074     }
1075     if(start < fileLine.length()) {
1076         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1077         infoln(fileLine);
1078         errorCode.set(U_PARSE_ERROR);
1079         return Collation::NO_LEVEL;
1080     }
1081     return relation;
1082 }
1083 
1084 static const struct {
1085     const char *name;
1086     UColAttribute attr;
1087 } attributes[] = {
1088     { "backwards", UCOL_FRENCH_COLLATION },
1089     { "alternate", UCOL_ALTERNATE_HANDLING },
1090     { "caseFirst", UCOL_CASE_FIRST },
1091     { "caseLevel", UCOL_CASE_LEVEL },
1092     // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1093     { "strength", UCOL_STRENGTH },
1094     // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1095     { "numeric", UCOL_NUMERIC_COLLATION }
1096 };
1097 
1098 static const struct {
1099     const char *name;
1100     UColAttributeValue value;
1101 } attributeValues[] = {
1102     { "default", UCOL_DEFAULT },
1103     { "primary", UCOL_PRIMARY },
1104     { "secondary", UCOL_SECONDARY },
1105     { "tertiary", UCOL_TERTIARY },
1106     { "quaternary", UCOL_QUATERNARY },
1107     { "identical", UCOL_IDENTICAL },
1108     { "off", UCOL_OFF },
1109     { "on", UCOL_ON },
1110     { "shifted", UCOL_SHIFTED },
1111     { "non-ignorable", UCOL_NON_IGNORABLE },
1112     { "lower", UCOL_LOWER_FIRST },
1113     { "upper", UCOL_UPPER_FIRST }
1114 };
1115 
parseAndSetAttribute(IcuTestErrorCode & errorCode)1116 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1117     // Parse attributes even if the Collator could not be created,
1118     // in order to report syntax errors.
1119     int32_t start = skipSpaces(1);
1120     int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1121     if(equalPos < 0) {
1122         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1123             parseAndSetReorderCodes(start + 7, errorCode);
1124             return;
1125         }
1126         errln("missing '=' on line %d", (int)fileLineNumber);
1127         infoln(fileLine);
1128         errorCode.set(U_PARSE_ERROR);
1129         return;
1130     }
1131 
1132     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1133     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1134     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1135         UColReorderCode max;
1136         if(valueString == UNICODE_STRING("space", 5)) {
1137             max = UCOL_REORDER_CODE_SPACE;
1138         } else if(valueString == UNICODE_STRING("punct", 5)) {
1139             max = UCOL_REORDER_CODE_PUNCTUATION;
1140         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1141             max = UCOL_REORDER_CODE_SYMBOL;
1142         } else if(valueString == UNICODE_STRING("currency", 8)) {
1143             max = UCOL_REORDER_CODE_CURRENCY;
1144         } else {
1145             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1146             infoln(fileLine);
1147             errorCode.set(U_PARSE_ERROR);
1148             return;
1149         }
1150         if(coll != NULL) {
1151             coll->setMaxVariable(max, errorCode);
1152             if(errorCode.isFailure()) {
1153                 errln("setMaxVariable() failed on line %d: %s",
1154                       (int)fileLineNumber, errorCode.errorName());
1155                 infoln(fileLine);
1156                 return;
1157             }
1158         }
1159         fileLine.remove();
1160         return;
1161     }
1162 
1163     UColAttribute attr;
1164     for(int32_t i = 0;; ++i) {
1165         if(i == UPRV_LENGTHOF(attributes)) {
1166             errln("invalid attribute name on line %d", (int)fileLineNumber);
1167             infoln(fileLine);
1168             errorCode.set(U_PARSE_ERROR);
1169             return;
1170         }
1171         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1172             attr = attributes[i].attr;
1173             break;
1174         }
1175     }
1176 
1177     UColAttributeValue value;
1178     for(int32_t i = 0;; ++i) {
1179         if(i == UPRV_LENGTHOF(attributeValues)) {
1180             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1181             infoln(fileLine);
1182             errorCode.set(U_PARSE_ERROR);
1183             return;
1184         }
1185         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1186             value = attributeValues[i].value;
1187             break;
1188         }
1189     }
1190 
1191     if(coll != NULL) {
1192         coll->setAttribute(attr, value, errorCode);
1193         if(errorCode.isFailure()) {
1194             errln("illegal attribute=value combination on line %d: %s",
1195                   (int)fileLineNumber, errorCode.errorName());
1196             infoln(fileLine);
1197             return;
1198         }
1199     }
1200     fileLine.remove();
1201 }
1202 
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1203 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1204     UVector32 reorderCodes(errorCode);
1205     while(start < fileLine.length()) {
1206         start = skipSpaces(start);
1207         int32_t limit = start;
1208         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1209         CharString name;
1210         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1211         int32_t code = CollationRuleParser::getReorderCode(name.data());
1212         if(code < 0) {
1213             if(uprv_stricmp(name.data(), "default") == 0) {
1214                 code = UCOL_REORDER_CODE_DEFAULT;  // -1
1215             } else {
1216                 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1217                 infoln(fileLine);
1218                 errorCode.set(U_PARSE_ERROR);
1219                 return;
1220             }
1221         }
1222         reorderCodes.addElement(code, errorCode);
1223         start = limit;
1224     }
1225     if(coll != NULL) {
1226         coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1227         if(errorCode.isFailure()) {
1228             errln("setReorderCodes() failed on line %d: %s",
1229                   (int)fileLineNumber, errorCode.errorName());
1230             infoln(fileLine);
1231             return;
1232         }
1233     }
1234     fileLine.remove();
1235 }
1236 
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1237 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1238     UnicodeString rules;
1239     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1240         rules.append(fileLine.unescape());
1241     }
1242     if(errorCode.isFailure()) { return; }
1243     logln(rules);
1244 
1245     UParseError parseError;
1246     UnicodeString reason;
1247     delete coll;
1248     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1249     if(coll == NULL) {
1250         errln("unable to allocate a new collator");
1251         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1252         return;
1253     }
1254     if(errorCode.isFailure()) {
1255         dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1256         infoln(UnicodeString("  reason: ") + reason);
1257         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1258         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1259             infoln(UnicodeString("  snippet: ...") +
1260                 parseError.preContext + "(!)" + parseError.postContext + "...");
1261         }
1262         delete coll;
1263         coll = NULL;
1264         errorCode.reset();
1265     } else {
1266         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1267                      UnicodeString(), reason);
1268     }
1269 }
1270 
setRootCollator(IcuTestErrorCode & errorCode)1271 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1272     if(errorCode.isFailure()) { return; }
1273     delete coll;
1274     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1275     if(errorCode.isFailure()) {
1276         dataerrln("unable to create a root collator");
1277         return;
1278     }
1279 }
1280 
setLocaleCollator(IcuTestErrorCode & errorCode)1281 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1282     if(errorCode.isFailure()) { return; }
1283     delete coll;
1284     coll = NULL;
1285     int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1286     if(at >= 0) {
1287         fileLine.setCharAt(at, (UChar)0x2a);  // *
1288     }
1289     CharString localeID;
1290     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1291     if(at >= 0) {
1292         localeID.data()[at - 9] = '@';
1293     }
1294     Locale locale(localeID.data());
1295     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1296         errln("invalid language tag on line %d", (int)fileLineNumber);
1297         infoln(fileLine);
1298         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1299         return;
1300     }
1301 
1302     logln("creating a collator for locale ID %s", locale.getName());
1303     coll = Collator::createInstance(locale, errorCode);
1304     if(errorCode.isFailure()) {
1305         dataerrln("unable to create a collator for locale %s on line %d",
1306                   locale.getName(), (int)fileLineNumber);
1307         infoln(fileLine);
1308         delete coll;
1309         coll = NULL;
1310         errorCode.reset();
1311     }
1312 }
1313 
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1314 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1315     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1316     // In some sequences with Tibetan composite vowel signs,
1317     // even if the string passes the FCD check,
1318     // those composites must be decomposed.
1319     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1320     int32_t index = 0;
1321     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1322         if(++index < s.length()) {
1323             UChar c = s[index];
1324             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1325         }
1326     }
1327     return FALSE;
1328 }
1329 
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1330 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1331                                      CharString &dest, int32_t partSize,
1332                                      IcuTestErrorCode &errorCode) {
1333     if(errorCode.isFailure()) { return FALSE; }
1334     uint8_t part[32];
1335     U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1336     UCharIterator iter;
1337     uiter_setString(&iter, s, length);
1338     uint32_t state[2] = { 0, 0 };
1339     for(;;) {
1340         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1341         UBool done = partLength < partSize;
1342         if(done) {
1343             // At the end, append the next byte as well which should be 00.
1344             ++partLength;
1345         }
1346         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1347         if(done) {
1348             return errorCode.isSuccess();
1349         }
1350     }
1351 }
1352 
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1353 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1354                                      const UChar *s, int32_t length,
1355                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1356     if(errorCode.isFailure()) { return FALSE; }
1357     coll->getCollationKey(s, length, key, errorCode);
1358     if(errorCode.isFailure()) {
1359         infoln(fileTestName);
1360         errln("Collator(%s).getCollationKey() failed: %s",
1361               norm, errorCode.errorName());
1362         infoln(line);
1363         return FALSE;
1364     }
1365     int32_t keyLength;
1366     const uint8_t *keyBytes = key.getByteArray(keyLength);
1367     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1368         infoln(fileTestName);
1369         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1370               norm);
1371         infoln(line);
1372         infoln(printCollationKey(key));
1373         return FALSE;
1374     }
1375 
1376     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1377     if(numLevels < UCOL_IDENTICAL) {
1378         ++numLevels;
1379     } else {
1380         numLevels = 5;
1381     }
1382     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1383         ++numLevels;
1384     }
1385     errorCode.assertSuccess();
1386     int32_t numLevelSeparators = 0;
1387     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1388         uint8_t b = keyBytes[i];
1389         if(b == 0) {
1390             infoln(fileTestName);
1391             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1392             infoln(line);
1393             infoln(printCollationKey(key));
1394             return FALSE;
1395         }
1396         if(b == 1) { ++numLevelSeparators; }
1397     }
1398     if(numLevelSeparators != (numLevels - 1)) {
1399         infoln(fileTestName);
1400         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1401               norm, (int)numLevelSeparators, (int)numLevels);
1402         infoln(line);
1403         infoln(printCollationKey(key));
1404         return FALSE;
1405     }
1406 
1407     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1408     static const int32_t partSizes[] = { 32, 3, 1 };
1409     for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1410         int32_t partSize = partSizes[psi];
1411         CharString parts;
1412         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1413             infoln(fileTestName);
1414             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1415                   norm, (int)partSize, errorCode.errorName());
1416             infoln(line);
1417             return FALSE;
1418         }
1419         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1420             infoln(fileTestName);
1421             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1422                   norm, (int)partSize);
1423             infoln(line);
1424             infoln(printCollationKey(key));
1425             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1426             return FALSE;
1427         }
1428     }
1429     return TRUE;
1430 }
1431 
1432 /**
1433  * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1434  * Leaves key unchanged if s does not contain U+FFFE.
1435  * @return TRUE if the key was successfully changed
1436  */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1437 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1438                                            CollationKey &key, IcuTestErrorCode &errorCode) {
1439     if(errorCode.isFailure()) { return FALSE; }
1440     LocalMemory<uint8_t> mergedKey;
1441     int32_t mergedKeyLength = 0;
1442     int32_t mergedKeyCapacity = 0;
1443     int32_t sLength = (length >= 0) ? length : u_strlen(s);
1444     int32_t segmentStart = 0;
1445     for(int32_t i = 0;;) {
1446         if(i == sLength) {
1447             if(segmentStart == 0) {
1448                 // s does not contain any U+FFFE.
1449                 return FALSE;
1450             }
1451         } else if(s[i] != 0xfffe) {
1452             ++i;
1453             continue;
1454         }
1455         // Get the sort key for another segment and merge it into mergedKey.
1456         CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1457         CollationKey key2;
1458         coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1459         int32_t key1Length, key2Length;
1460         const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1461         const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1462         uint8_t *dest;
1463         int32_t minCapacity = key1Length + key2Length;
1464         if(key1Length > 0) { --minCapacity; }
1465         if(minCapacity <= mergedKeyCapacity) {
1466             dest = mergedKey.getAlias();
1467         } else {
1468             if(minCapacity <= 200) {
1469                 mergedKeyCapacity = 200;
1470             } else if(minCapacity <= 2 * mergedKeyCapacity) {
1471                 mergedKeyCapacity *= 2;
1472             } else {
1473                 mergedKeyCapacity = minCapacity;
1474             }
1475             dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1476         }
1477         U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1478         if(key1Length == 0) {
1479             // key2 is the sort key for the first segment.
1480             uprv_memcpy(dest, key2Bytes, key2Length);
1481             mergedKeyLength = key2Length;
1482         } else {
1483             mergedKeyLength =
1484                 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1485                                    dest, mergedKeyCapacity);
1486         }
1487         if(i == sLength) { break; }
1488         segmentStart = ++i;
1489     }
1490     key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1491     return TRUE;
1492 }
1493 
1494 namespace {
1495 
1496 /**
1497  * Replaces unpaired surrogates with U+FFFD.
1498  * Returns s if no replacement was made, otherwise buffer.
1499  */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1500 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1501     int32_t i = 0;
1502     while(i < s.length()) {
1503         UChar32 c = s.char32At(i);
1504         if(U_IS_SURROGATE(c)) {
1505             if(buffer.length() < i) {
1506                 buffer.append(s, buffer.length(), i - buffer.length());
1507             }
1508             buffer.append((UChar)0xfffd);
1509         }
1510         i += U16_LENGTH(c);
1511     }
1512     if(buffer.isEmpty()) {
1513         return s;
1514     }
1515     if(buffer.length() < i) {
1516         buffer.append(s, buffer.length(), i - buffer.length());
1517     }
1518     return buffer;
1519 }
1520 
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1521 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1522                            UCollationResult order, UBool collHasCaseLevel) {
1523     if(order == UCOL_EQUAL) {
1524         return Collation::NO_LEVEL;
1525     }
1526     int32_t prevKeyLength;
1527     const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1528     int32_t keyLength;
1529     const uint8_t *bytes = key.getByteArray(keyLength);
1530     int32_t level = Collation::PRIMARY_LEVEL;
1531     for(int32_t i = 0;; ++i) {
1532         uint8_t b = prevBytes[i];
1533         if(b != bytes[i]) { break; }
1534         if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1535             ++level;
1536             if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1537                 ++level;
1538             }
1539         }
1540     }
1541     return level;
1542 }
1543 
1544 }
1545 
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1546 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1547                                      const UnicodeString &prevString, const UnicodeString &s,
1548                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1549                                      IcuTestErrorCode &errorCode) {
1550     if(errorCode.isFailure()) { return FALSE; }
1551 
1552     // Get the sort keys first, for error debug output.
1553     CollationKey prevKey;
1554     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1555                         prevKey, errorCode)) {
1556         return FALSE;
1557     }
1558     CollationKey key;
1559     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1560 
1561     UCollationResult order = coll->compare(prevString, s, errorCode);
1562     if(order != expectedOrder || errorCode.isFailure()) {
1563         infoln(fileTestName);
1564         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1565               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1566         infoln(prevFileLine);
1567         infoln(fileLine);
1568         infoln(printCollationKey(prevKey));
1569         infoln(printCollationKey(key));
1570         return FALSE;
1571     }
1572     order = coll->compare(s, prevString, errorCode);
1573     if(order != -expectedOrder || errorCode.isFailure()) {
1574         infoln(fileTestName);
1575         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1576               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1577         infoln(prevFileLine);
1578         infoln(fileLine);
1579         infoln(printCollationKey(prevKey));
1580         infoln(printCollationKey(key));
1581         return FALSE;
1582     }
1583     // Test NUL-termination if the strings do not contain NUL characters.
1584     UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1585     if(!containNUL) {
1586         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1587         if(order != expectedOrder || errorCode.isFailure()) {
1588             infoln(fileTestName);
1589             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1590                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1591             infoln(prevFileLine);
1592             infoln(fileLine);
1593             infoln(printCollationKey(prevKey));
1594             infoln(printCollationKey(key));
1595             return FALSE;
1596         }
1597         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1598         if(order != -expectedOrder || errorCode.isFailure()) {
1599             infoln(fileTestName);
1600             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1601                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1602             infoln(prevFileLine);
1603             infoln(fileLine);
1604             infoln(printCollationKey(prevKey));
1605             infoln(printCollationKey(key));
1606             return FALSE;
1607         }
1608     }
1609 
1610 #if U_HAVE_STD_STRING
1611     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1612     // Unpaired surrogates cannot be converted to UTF-8.
1613     // Create valid UTF-16 strings if necessary, and use those for
1614     // both the expected compare() result and for the input to compare(UTF-8).
1615     UnicodeString prevBuffer, sBuffer;
1616     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1617     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1618     std::string prevUTF8, sUTF8;
1619     UnicodeString(prevValid).toUTF8String(prevUTF8);
1620     UnicodeString(sValid).toUTF8String(sUTF8);
1621     UCollationResult expectedUTF8Order;
1622     if(&prevValid == &prevString && &sValid == &s) {
1623         expectedUTF8Order = expectedOrder;
1624     } else {
1625         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1626     }
1627 
1628     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1629     if(order != expectedUTF8Order || errorCode.isFailure()) {
1630         infoln(fileTestName);
1631         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1632               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1633         infoln(prevFileLine);
1634         infoln(fileLine);
1635         infoln(printCollationKey(prevKey));
1636         infoln(printCollationKey(key));
1637         return FALSE;
1638     }
1639     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1640     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1641         infoln(fileTestName);
1642         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1643               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1644         infoln(prevFileLine);
1645         infoln(fileLine);
1646         infoln(printCollationKey(prevKey));
1647         infoln(printCollationKey(key));
1648         return FALSE;
1649     }
1650     // Test NUL-termination if the strings do not contain NUL characters.
1651     if(!containNUL) {
1652         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1653         if(order != expectedUTF8Order || errorCode.isFailure()) {
1654             infoln(fileTestName);
1655             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1656                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1657             infoln(prevFileLine);
1658             infoln(fileLine);
1659             infoln(printCollationKey(prevKey));
1660             infoln(printCollationKey(key));
1661             return FALSE;
1662         }
1663         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1664         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1665             infoln(fileTestName);
1666             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1667                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1668             infoln(prevFileLine);
1669             infoln(fileLine);
1670             infoln(printCollationKey(prevKey));
1671             infoln(printCollationKey(key));
1672             return FALSE;
1673         }
1674     }
1675 #endif
1676 
1677     UCharIterator leftIter;
1678     UCharIterator rightIter;
1679     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1680     uiter_setString(&rightIter, s.getBuffer(), s.length());
1681     order = coll->compare(leftIter, rightIter, errorCode);
1682     if(order != expectedOrder || errorCode.isFailure()) {
1683         infoln(fileTestName);
1684         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1685               "wrong order: %d != %d (%s)",
1686               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1687         infoln(prevFileLine);
1688         infoln(fileLine);
1689         infoln(printCollationKey(prevKey));
1690         infoln(printCollationKey(key));
1691         return FALSE;
1692     }
1693 
1694     order = prevKey.compareTo(key, errorCode);
1695     if(order != expectedOrder || errorCode.isFailure()) {
1696         infoln(fileTestName);
1697         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1698               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1699         infoln(prevFileLine);
1700         infoln(fileLine);
1701         infoln(printCollationKey(prevKey));
1702         infoln(printCollationKey(key));
1703         return FALSE;
1704     }
1705     UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1706     int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1707     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1708         if(level != expectedLevel) {
1709             infoln(fileTestName);
1710             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1711                   (int)fileLineNumber, norm, order, level, expectedLevel);
1712             infoln(prevFileLine);
1713             infoln(fileLine);
1714             infoln(printCollationKey(prevKey));
1715             infoln(printCollationKey(key));
1716             return FALSE;
1717         }
1718     }
1719 
1720     // If either string contains U+FFFE, then their sort keys must compare the same as
1721     // the merged sort keys of each string's between-FFFE segments.
1722     //
1723     // It is not required that
1724     //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1725     // only that those two methods yield the same order.
1726     //
1727     // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1728     if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1729                 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1730             errorCode.isFailure()) {
1731         order = prevKey.compareTo(key, errorCode);
1732         if(order != expectedOrder || errorCode.isFailure()) {
1733             infoln(fileTestName);
1734             errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1735                 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1736                 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1737             infoln(prevFileLine);
1738             infoln(fileLine);
1739             infoln(printCollationKey(prevKey));
1740             infoln(printCollationKey(key));
1741             return FALSE;
1742         }
1743         int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1744         if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1745             if(mergedLevel != level) {
1746                 infoln(fileTestName);
1747                 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1748                     "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1749                     (int)fileLineNumber, norm, order, mergedLevel, level);
1750                 infoln(prevFileLine);
1751                 infoln(fileLine);
1752                 infoln(printCollationKey(prevKey));
1753                 infoln(printCollationKey(key));
1754                 return FALSE;
1755             }
1756         }
1757     }
1758     return TRUE;
1759 }
1760 
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1761 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1762     if(errorCode.isFailure()) { return; }
1763     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1764     UnicodeString prevString, s;
1765     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1766     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1767         // Parse the line even if it will be ignored (when we do not have a Collator)
1768         // in order to report syntax issues.
1769         Collation::Level relation = parseRelationAndString(s, errorCode);
1770         if(errorCode.isFailure()) {
1771             errorCode.reset();
1772             break;
1773         }
1774         if(coll == NULL) {
1775             // We were unable to create the Collator but continue with tests.
1776             // Ignore test data for this Collator.
1777             // The next Collator creation might work.
1778             continue;
1779         }
1780         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1781         Collation::Level expectedLevel = relation;
1782         s.getTerminatedBuffer();  // Ensure NUL-termination.
1783         UBool isOk = TRUE;
1784         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1785             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1786             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1787                                    expectedOrder, expectedLevel, errorCode);
1788         }
1789         if(isOk) {
1790             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1791             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1792                                    expectedOrder, expectedLevel, errorCode);
1793         }
1794         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1795             UnicodeString pn = nfd->normalize(prevString, errorCode);
1796             UnicodeString n = nfd->normalize(s, errorCode);
1797             pn.getTerminatedBuffer();
1798             n.getTerminatedBuffer();
1799             errorCode.assertSuccess();
1800             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1801                                    expectedOrder, expectedLevel, errorCode);
1802         }
1803         if(!isOk) {
1804             errorCode.reset();  // already reported
1805         }
1806         prevFileLine = fileLine;
1807         prevString = s;
1808         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1809     }
1810 }
1811 
TestDataDriven()1812 void CollationTest::TestDataDriven() {
1813     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1814 
1815     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1816     nfd = Normalizer2::getNFDInstance(errorCode);
1817     if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1818         return;
1819     }
1820 
1821     CharString path(getSourceTestData(errorCode), errorCode);
1822     path.appendPathPart("collationtest.txt", errorCode);
1823     const char *codePage = "UTF-8";
1824     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1825     if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1826         return;
1827     }
1828     // Read a new line if necessary.
1829     // Sub-parsers leave the first line set that they do not handle.
1830     while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1831         if(!isSectionStarter(fileLine[0])) {
1832             errln("syntax error on line %d", (int)fileLineNumber);
1833             infoln(fileLine);
1834             return;
1835         }
1836         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1837             fileTestName = fileLine;
1838             logln(fileLine);
1839             fileLine.remove();
1840         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1841             setRootCollator(errorCode);
1842             fileLine.remove();
1843         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1844             setLocaleCollator(errorCode);
1845             fileLine.remove();
1846         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1847             buildTailoring(f.getAlias(), errorCode);
1848         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1849             parseAndSetAttribute(errorCode);
1850         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1851             checkCompareStrings(f.getAlias(), errorCode);
1852         } else {
1853             errln("syntax error on line %d", (int)fileLineNumber);
1854             infoln(fileLine);
1855             return;
1856         }
1857     }
1858 }
1859 
1860 #endif  // !UCONFIG_NO_COLLATION
1861