• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/stringpiece.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uniset.h"
29 #include "unicode/unistr.h"
30 #include "unicode/usetiter.h"
31 #include "unicode/ustring.h"
32 #include "charstr.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationfcd.h"
37 #include "collationiterator.h"
38 #include "collationroot.h"
39 #include "collationrootelements.h"
40 #include "collationruleparser.h"
41 #include "collationweights.h"
42 #include "cstring.h"
43 #include "intltest.h"
44 #include "normalizer2impl.h"
45 #include "ucbuf.h"
46 #include "uhash.h"
47 #include "uitercollationiterator.h"
48 #include "utf16collationiterator.h"
49 #include "utf8collationiterator.h"
50 #include "uvectr32.h"
51 #include "uvectr64.h"
52 #include "writesrc.h"
53 
54 class CodePointIterator;
55 
56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57 
58 class CollationTest : public IntlTest {
59 public:
CollationTest()60     CollationTest()
61             : fcd(nullptr), nfd(nullptr),
62               fileLineNumber(0),
63               coll(nullptr) {}
64 
~CollationTest()65     ~CollationTest() {
66         delete coll;
67     }
68 
69     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override;
70 
71     void TestMinMax();
72     void TestImplicits();
73     void TestNulTerminated();
74     void TestIllegalUTF8();
75     void TestShortFCDData();
76     void TestFCD();
77     void TestCollationWeights();
78     void TestRootElements();
79     void TestTailoredElements();
80     void TestDataDriven();
81     void TestLongLocale();
82     void TestBuilderContextsOverflow();
83     void TestHang22414();
84 
85 private:
86     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
87     void checkAllocWeights(CollationWeights &cw,
88                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
89                            int32_t someLength, int32_t minCount);
90 
91     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
92     static UnicodeString printCollationKey(const CollationKey &key);
93 
94     // Helpers & fields for data-driven test.
isCROrLF(char16_t c)95     static UBool isCROrLF(char16_t c) { return c == 0xa || c == 0xd; }
isSpace(char16_t c)96     static UBool isSpace(char16_t c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(char16_t c)97     static UBool isSectionStarter(char16_t c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
skipSpaces(int32_t i)98     int32_t skipSpaces(int32_t i) {
99         while(isSpace(fileLine[i])) { ++i; }
100         return i;
101     }
102 
103     UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
104     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
105     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
106     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
107     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
108     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
109     void setRootCollator(IcuTestErrorCode &errorCode);
110     void setLocaleCollator(IcuTestErrorCode &errorCode);
111 
112     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
113 
114     UBool getSortKeyParts(const char16_t *s, int32_t length,
115                           CharString &dest, int32_t partSize,
116                           IcuTestErrorCode &errorCode);
117     UBool getCollationKey(const char *norm, const UnicodeString &line,
118                           const char16_t *s, int32_t length,
119                           CollationKey &key, IcuTestErrorCode &errorCode);
120     UBool getMergedCollationKey(const char16_t *s, int32_t length,
121                                 CollationKey &key, IcuTestErrorCode &errorCode);
122     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
123                           const UnicodeString &prevString, const UnicodeString &s,
124                           UCollationResult expectedOrder, Collation::Level expectedLevel,
125                           IcuTestErrorCode &errorCode);
126     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
127 
128     const Normalizer2 *fcd, *nfd;
129     UnicodeString fileLine;
130     int32_t fileLineNumber;
131     UnicodeString fileTestName;
132     Collator *coll;
133 };
134 
createCollationTest()135 extern IntlTest *createCollationTest() {
136     return new CollationTest();
137 }
138 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)139 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
140     if(exec) {
141         logln("TestSuite CollationTest: ");
142     }
143     TESTCASE_AUTO_BEGIN;
144     TESTCASE_AUTO(TestMinMax);
145     TESTCASE_AUTO(TestImplicits);
146     TESTCASE_AUTO(TestNulTerminated);
147     TESTCASE_AUTO(TestIllegalUTF8);
148     TESTCASE_AUTO(TestShortFCDData);
149     TESTCASE_AUTO(TestFCD);
150     TESTCASE_AUTO(TestCollationWeights);
151     TESTCASE_AUTO(TestRootElements);
152     TESTCASE_AUTO(TestTailoredElements);
153     TESTCASE_AUTO(TestDataDriven);
154     TESTCASE_AUTO(TestLongLocale);
155     TESTCASE_AUTO(TestBuilderContextsOverflow);
156     TESTCASE_AUTO(TestHang22414);
157     TESTCASE_AUTO_END;
158 }
159 
TestMinMax()160 void CollationTest::TestMinMax() {
161     IcuTestErrorCode errorCode(*this, "TestMinMax");
162 
163     setRootCollator(errorCode);
164     if(errorCode.isFailure()) {
165         errorCode.reset();
166         return;
167     }
168     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
169     if(rbc == nullptr) {
170         errln("the root collator is not a RuleBasedCollator");
171         return;
172     }
173 
174     static const char16_t s[2] = { 0xfffe, 0xffff };
175     UVector64 ces(errorCode);
176     rbc->internalGetCEs(UnicodeString(false, s, 2), ces, errorCode);
177     errorCode.assertSuccess();
178     if(ces.size() != 2) {
179         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
180         return;
181     }
182     int64_t ce = ces.elementAti(0);
183     int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
184     if(ce != expected) {
185         errln("CE(U+fffe)=%04lx != 02..", (long)ce);
186     }
187 
188     ce = ces.elementAti(1);
189     expected = Collation::makeCE(Collation::MAX_PRIMARY);
190     if(ce != expected) {
191         errln("CE(U+ffff)=%04lx != max..", (long)ce);
192     }
193 }
194 
TestImplicits()195 void CollationTest::TestImplicits() {
196     IcuTestErrorCode errorCode(*this, "TestImplicits");
197 
198     const CollationData *cd = CollationRoot::getData(errorCode);
199     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
200         return;
201     }
202 
203     // Implicit primary weights should be assigned for the following sets,
204     // and sort in ascending order by set and then code point.
205     // See https://www.unicode.org/reports/tr10/#Implicit_Weights
206 
207     // core Han Unified Ideographs
208     UnicodeSet coreHan("[\\p{unified_ideograph}&"
209                             "[\\p{Block=CJK_Unified_Ideographs}"
210                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
211                        errorCode);
212     // all other Unified Han ideographs
213     UnicodeSet otherHan("[\\p{unified ideograph}-"
214                             "[\\p{Block=CJK_Unified_Ideographs}"
215                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
216                         errorCode);
217     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
218     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
219 
220     // Starting with CLDR 26/ICU 54, the root Han order may instead be
221     // the Unihan radical-stroke order.
222     // The tests should pass either way, so we only test the order of a small set of Han characters
223     // whose radical-stroke order is the same as their code point order.
224     //
225     // When the radical-stroke data (kRSUnicode) for one of these characters changes
226     // such that it no longer sorts in code point order,
227     // then we need to remove it from this set.
228     // (These changes are easiest to see in the change history of the Unicode Tools file
229     // unicodetools/data/ucd/dev/Unihan/kRSUnicode.txt.)
230     // For example, in Unicode 15.1, U+503B has a kRSUnicode value of 9.9
231     // while the neighboring characters still have 9.8. We remove the out-of-order U+503B.
232     //
233     // FYI: The Unicode Tools program GenerateUnihanCollators prints something like
234     // hanInCPOrder = [一-世丘-丫中-丼举-么乊-习乣-亏...鼢-齡齣-龏龑-龥]
235     // number of original-Unihan characters out of order: 318
236     UnicodeSet someHanInCPOrder(
237             u"[\u4E00-\u4E16\u4E18-\u4E2B\u4E2D-\u4E3C\u4E3E-\u4E48"
238             u"\u4E4A-\u4E60\u4E63-\u4E8F\u4E91-\u4F63\u4F65-\u503A\u503C-\u50F1\u50F3-\u50F6]",
239             errorCode);
240     UnicodeSet inOrder(someHanInCPOrder);
241     inOrder.addAll(unassigned).freeze();
242     if(errorCode.errIfFailureAndReset("UnicodeSet")) {
243         return;
244     }
245     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
246     const char *const setNames[] = { "core Han", "Han extensions", "unassigned" };
247     UChar32 prev = 0;
248     uint32_t prevPrimary = 0;
249     UTF16CollationIterator ci(cd, false, nullptr, nullptr, nullptr);
250     for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
251         const char *setName = setNames[i];
252         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
253         while(iter->next()) {
254             UChar32 c = iter->getCodepoint();
255             UnicodeString s(c);
256             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
257             int64_t ce = ci.nextCE(errorCode);
258             int64_t ce2 = ci.nextCE(errorCode);
259             if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
260                 return;
261             }
262             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
263                 errln("%s: CollationIterator.nextCE(U+%04lx) did not yield exactly one CE",
264                       setName, (long)c);
265                 continue;
266             }
267             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
268                 errln("%s: CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
269                       setName, (long)c, (long)(ce & 0xffffffff));
270                 continue;
271             }
272             uint32_t primary = (uint32_t)(ce >> 32);
273             if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
274                 errln("%s: CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
275                       setName, (long)c, (long)primary, (long)prev, (long)prevPrimary);
276             }
277             prev = c;
278             prevPrimary = primary;
279         }
280     }
281 }
282 
TestNulTerminated()283 void CollationTest::TestNulTerminated() {
284     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
285     const CollationData *data = CollationRoot::getData(errorCode);
286     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
287         return;
288     }
289 
290     static const char16_t s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
291 
292     UTF16CollationIterator ci1(data, false, s, s, s + 2);
293     UTF16CollationIterator ci2(data, false, s + 2, s + 2, nullptr);
294     for(int32_t i = 0;; ++i) {
295         int64_t ce1 = ci1.nextCE(errorCode);
296         int64_t ce2 = ci2.nextCE(errorCode);
297         if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
298             return;
299         }
300         if(ce1 != ce2) {
301             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
302             break;
303         }
304         if(ce1 == Collation::NO_CE) { break; }
305     }
306 }
307 
TestIllegalUTF8()308 void CollationTest::TestIllegalUTF8() {
309     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
310 
311     setRootCollator(errorCode);
312     if(errorCode.isFailure()) {
313         errorCode.reset();
314         return;
315     }
316     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
317 
318     static const StringPiece strings[] = {
319         // string with U+FFFD == illegal byte sequence
320         u8"a\uFFFDz",                   "a\x80z",  // trail byte
321         u8"a\uFFFD\uFFFDz",             "a\xc1\x81z",  // non-shortest form
322         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xe0\x82\x83z",  // non-shortest form
323         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
324         u8"a\uFFFD\uFFFD\uFFFDz",       "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
325         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz",  // non-shortest form
326         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
327     };
328 
329     for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
330         StringPiece fffd(strings[i]);
331         StringPiece illegal(strings[i + 1]);
332         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
333         if(order != UCOL_EQUAL) {
334             errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
335                   (int)i, order);
336         }
337     }
338 }
339 
340 namespace {
341 
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)342 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
343     for(UChar32 c = 0x10000; c < 0x110000;) {
344         UChar32 next = c + 0x400;
345         if(src.containsSome(c, next - 1)) {
346             dest.add(U16_LEAD(c));
347         }
348         c = next;
349     }
350 }
351 
352 }  // namespace
353 
TestShortFCDData()354 void CollationTest::TestShortFCDData() {
355     // See CollationFCD class comments.
356     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
357     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
358     errorCode.assertSuccess();
359     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
360     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
361     UnicodeSet lccc;  // actual
362     for(UChar32 c = 0; c <= 0xffff; ++c) {
363         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
364     }
365     UnicodeSet diff(expectedLccc);
366     diff.removeAll(lccc);
367     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
368     UnicodeString empty("[]");
369     UnicodeString diffString;
370     diff.toPattern(diffString, true);
371     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
372     diff = lccc;
373     diff.removeAll(expectedLccc);
374     diff.toPattern(diffString, true);
375     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, true);
376 
377     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
378     if (errorCode.isSuccess()) {
379         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
380         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
381         UnicodeSet tccc;  // actual
382         for(UChar32 c = 0; c <= 0xffff; ++c) {
383             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
384         }
385         diff = expectedTccc;
386         diff.removeAll(tccc);
387         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
388         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
389         diff = tccc;
390         diff.removeAll(expectedTccc);
391         diff.toPattern(diffString, true);
392         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
393     }
394 }
395 
396 class CodePointIterator {
397 public:
CodePointIterator(const UChar32 * cp,int32_t length)398     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()399     void resetToStart() { pos = 0; }
next()400     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()401     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const402     int32_t getLength() const { return length; }
getIndex() const403     int getIndex() const { return (int)pos; }
404 private:
405     const UChar32 *cp;
406     int32_t length;
407     int32_t pos;
408 };
409 
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)410 void CollationTest::checkFCD(const char *name,
411                              CollationIterator &ci, CodePointIterator &cpi) {
412     IcuTestErrorCode errorCode(*this, "checkFCD");
413 
414     // Iterate forward to the limit.
415     for(;;) {
416         UChar32 c1 = ci.nextCodePoint(errorCode);
417         UChar32 c2 = cpi.next();
418         if(c1 != c2) {
419             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
420                   name, (long)c1, (long)c2, cpi.getIndex());
421             return;
422         }
423         if(c1 < 0) { break; }
424     }
425 
426     // Iterate backward most of the way.
427     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
428         UChar32 c1 = ci.previousCodePoint(errorCode);
429         UChar32 c2 = cpi.previous();
430         if(c1 != c2) {
431             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
432                   name, (long)c1, (long)c2, cpi.getIndex());
433             return;
434         }
435     }
436 
437     // Forward again.
438     for(;;) {
439         UChar32 c1 = ci.nextCodePoint(errorCode);
440         UChar32 c2 = cpi.next();
441         if(c1 != c2) {
442             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
443                   name, (long)c1, (long)c2, cpi.getIndex());
444             return;
445         }
446         if(c1 < 0) { break; }
447     }
448 
449     // Iterate backward to the start.
450     for(;;) {
451         UChar32 c1 = ci.previousCodePoint(errorCode);
452         UChar32 c2 = cpi.previous();
453         if(c1 != c2) {
454             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
455                   name, (long)c1, (long)c2, cpi.getIndex());
456             return;
457         }
458         if(c1 < 0) { break; }
459     }
460 }
461 
TestFCD()462 void CollationTest::TestFCD() {
463     IcuTestErrorCode errorCode(*this, "TestFCD");
464     const CollationData *data = CollationRoot::getData(errorCode);
465     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
466         return;
467     }
468 
469     // Input string, not FCD, NUL-terminated.
470     static const char16_t s[] = {
471         0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
472         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
473         0x327, 0x308,  // ccc=202, 230
474         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
475         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
476         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
477         0xac01,
478         0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
479         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
480         0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
481         0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
482         0x4e00, 0xf81,
483         0
484     };
485     // Expected code points.
486     static const UChar32 cp[] = {
487         0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
488         0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
489         0x1D15F, 0x1D16D,
490         0xac01,
491         0x63, 0x327, 0x1D165, 0x1D16D,
492         0x61,
493         0xf71, 0xf71, 0xf72, 0xf74, 0x301,
494         0x4e00, 0xf71, 0xf80
495     };
496 
497     FCDUTF16CollationIterator u16ci(data, false, s, s, nullptr);
498     if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
499         return;
500     }
501     CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
502     checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
503 
504     cpi.resetToStart();
505     std::string utf8;
506     UnicodeString(s).toUTF8String(utf8);
507     FCDUTF8CollationIterator u8ci(data, false,
508                                   reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
509     if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
510         return;
511     }
512     checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
513 
514     cpi.resetToStart();
515     UCharIterator iter;
516     uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
517     FCDUIterCollationIterator uici(data, false, iter, 0);
518     if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
519         return;
520     }
521     checkFCD("FCDUIterCollationIterator", uici, cpi);
522 }
523 
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)524 void CollationTest::checkAllocWeights(CollationWeights &cw,
525                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
526                                       int32_t someLength, int32_t minCount) {
527     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
528         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = false",
529               (long)lowerLimit, (long)upperLimit, (long)n);
530         return;
531     }
532     uint32_t previous = lowerLimit;
533     int32_t count = 0;  // number of weights that have someLength
534     for(int32_t i = 0; i < n; ++i) {
535         uint32_t w = cw.nextWeight();
536         if(w == 0xffffffff) {
537             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
538                   "returns only %ld weights",
539                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
540             return;
541         }
542         if(!(previous < w && w < upperLimit)) {
543             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
544                   "number %ld -> %lx not between %lx and %lx",
545                   (long)lowerLimit, (long)upperLimit, (long)n,
546                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
547             return;
548         }
549         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
550     }
551     if(count < minCount) {
552         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
553               "returns only %ld < %ld weights of length %d",
554               (long)lowerLimit, (long)upperLimit, (long)n,
555               (long)count, (long)minCount, (int)someLength);
556     }
557 }
558 
TestCollationWeights()559 void CollationTest::TestCollationWeights() {
560     CollationWeights cw;
561 
562     // Non-compressible primaries use 254 second bytes 02..FF.
563     logln("CollationWeights.initForPrimary(non-compressible)");
564     cw.initForPrimary(false);
565     // Expect 1 weight 11 and 254 weights 12xx.
566     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
567     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
568     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
569     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
570     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
571     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
572     // Expect 254^2=64516 three-byte weights.
573     // During computation, there should be 3 three-byte ranges
574     // 10ffff, 11xxxx, 120202.
575     // The middle one should be split 64515:1,
576     // and the newly-split-off range and the last ranged lengthened.
577     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
578     // Expect weights 1102 & 1103.
579     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
580     // Expect weights 102102 & 102103.
581     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
582 
583     // Compressible primaries use 251 second bytes 04..FE.
584     logln("CollationWeights.initForPrimary(compressible)");
585     cw.initForPrimary(true);
586     // Expect 1 weight 11 and 251 weights 12xx.
587     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
588     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
589     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
590     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
591     // Expect weights 1104 & 1105.
592     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
593     // Expect weights 102102 & 102103.
594     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
595 
596     // Secondary and tertiary weights use only bytes 3 & 4.
597     logln("CollationWeights.initForSecondary()");
598     cw.initForSecondary();
599     // Expect weights fbxx and all four fc..ff.
600     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
601 
602     logln("CollationWeights.initForTertiary()");
603     cw.initForTertiary();
604     // Expect weights 3dxx and both 3e & 3f.
605     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
606 }
607 
608 namespace {
609 
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)610 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
611                 uint32_t p, uint32_t s, uint32_t ctq) {
612     uint32_t p1 = p >> 24;
613     uint32_t p2 = (p >> 16) & 0xff;
614     uint32_t p3 = (p >> 8) & 0xff;
615     uint32_t p4 = p & 0xff;
616     uint32_t s1 = s >> 8;
617     uint32_t s2 = s & 0xff;
618     // ctq = Case, Tertiary, Quaternary
619     uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
620     uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
621     uint32_t t1 = t >> 8;
622     uint32_t t2 = t & 0xff;
623     uint32_t q = ctq & Collation::QUATERNARY_MASK;
624     // No leading zero bytes.
625     if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
626         return false;
627     }
628     // No intermediate zero bytes.
629     if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
630         return false;
631     }
632     if(p2 != 0 && p3 == 0 && p4 != 0) {
633         return false;
634     }
635     // Minimum & maximum lead bytes.
636     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
637             s1 == Collation::LEVEL_SEPARATOR_BYTE ||
638             t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
639         return false;
640     }
641     if(c > 2) {
642         return false;
643     }
644     // The valid byte range for the second primary byte depends on compressibility.
645     if(p2 != 0) {
646         if(data.isCompressibleLeadByte(p1)) {
647             if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
648                     Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
649                 return false;
650             }
651         } else {
652             if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
653                 return false;
654             }
655         }
656     }
657     // Other bytes just need to avoid the level separator.
658     // Trailing zeros are ok.
659     U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
660     if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
661             s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
662         return false;
663     }
664     // Well-formed CEs.
665     if(p == 0) {
666         if(s == 0) {
667             if(t == 0) {
668                 // Completely ignorable CE.
669                 // Quaternary CEs are not supported.
670                 if(c != 0 || q != 0) {
671                     return false;
672                 }
673             } else {
674                 // Tertiary CE.
675                 if(t < re.getTertiaryBoundary() || c != 2) {
676                     return false;
677                 }
678             }
679         } else {
680             // Secondary CE.
681             if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
682                 return false;
683             }
684         }
685     } else {
686         // Primary CE.
687         if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
688                 s >= re.getSecondaryBoundary()) {
689             return false;
690         }
691         if(t == 0 || t >= re.getTertiaryBoundary()) {
692             return false;
693         }
694     }
695     return true;
696 }
697 
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)698 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
699     uint32_t p = (uint32_t)(ce >> 32);
700     uint32_t secTer = (uint32_t)ce;
701     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
702 }
703 
704 class RootElementsIterator {
705 public:
RootElementsIterator(const CollationData & root)706     RootElementsIterator(const CollationData &root)
707             : data(root),
708               elements(root.rootElements), length(root.rootElementsLength),
709               pri(0), secTer(0),
710               index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
711 
next()712     UBool next() {
713         if(index >= length) { return false; }
714         uint32_t p = elements[index];
715         if(p == CollationRootElements::PRIMARY_SENTINEL) { return false; }
716         if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
717             ++index;
718             secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
719             return true;
720         }
721         if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
722             // End of a range, enumerate the primaries in the range.
723             int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
724             p &= 0xffffff00;
725             if(pri == p) {
726                 // Finished the range, return the next CE after it.
727                 ++index;
728                 return next();
729             }
730             U_ASSERT(pri < p);
731             // Return the next primary in this range.
732             UBool isCompressible = data.isCompressiblePrimary(pri);
733             if((pri & 0xffff) == 0) {
734                 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
735             } else {
736                 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
737             }
738             return true;
739         }
740         // Simple primary CE.
741         ++index;
742         pri = p;
743         // Does this have an explicit below-common sec/ter unit,
744         // or does it imply a common one?
745         if(index == length) {
746             secTer = Collation::COMMON_SEC_AND_TER_CE;
747         } else {
748             secTer = elements[index];
749             if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
750                 // No sec/ter delta.
751                 secTer = Collation::COMMON_SEC_AND_TER_CE;
752             } else {
753                 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
754                 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
755                     // Implied sec/ter.
756                     secTer = Collation::COMMON_SEC_AND_TER_CE;
757                 } else {
758                     // Explicit sec/ter below common/common.
759                     ++index;
760                 }
761             }
762         }
763         return true;
764     }
765 
getPrimary() const766     uint32_t getPrimary() const { return pri; }
getSecTer() const767     uint32_t getSecTer() const { return secTer; }
768 
769 private:
770     const CollationData &data;
771     const uint32_t *elements;
772     int32_t length;
773 
774     uint32_t pri;
775     uint32_t secTer;
776     int32_t index;
777 };
778 
779 }  // namespace
780 
TestRootElements()781 void CollationTest::TestRootElements() {
782     IcuTestErrorCode errorCode(*this, "TestRootElements");
783     const CollationData *root = CollationRoot::getData(errorCode);
784     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
785         return;
786     }
787     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
788     RootElementsIterator iter(*root);
789 
790     // We check each root CE for validity,
791     // and we also verify that there is a tailoring gap between each two CEs.
792     CollationWeights cw1c;  // compressible primary weights
793     CollationWeights cw1u;  // uncompressible primary weights
794     CollationWeights cw2;
795     CollationWeights cw3;
796 
797     cw1c.initForPrimary(true);
798     cw1u.initForPrimary(false);
799     cw2.initForSecondary();
800     cw3.initForTertiary();
801 
802     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
803     // nor the special merge-separator CE for U+FFFE.
804     uint32_t prevPri = 0;
805     uint32_t prevSec = 0;
806     uint32_t prevTer = 0;
807     while(iter.next()) {
808         uint32_t pri = iter.getPrimary();
809         uint32_t secTer = iter.getSecTer();
810         // CollationRootElements CEs must have 0 case and quaternary bits.
811         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
812             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
813                   (long)pri, (long)secTer);
814         }
815         uint32_t sec = secTer >> 16;
816         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
817         uint32_t ctq = ter;
818         if(pri == 0 && sec == 0 && ter != 0) {
819             // Tertiary CEs must have uppercase bits,
820             // but they are not stored in the CollationRootElements.
821             ctq |= 0x8000;
822         }
823         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
824             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
825         } else {
826             if(pri != prevPri) {
827                 uint32_t newWeight = 0;
828                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
829                     // There is currently no tailoring gap after primary ignorables,
830                     // and we forbid tailoring after U+FFFD and U+FFFF.
831                 } else if(root->isCompressiblePrimary(prevPri)) {
832                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
833                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
834                               (long)prevPri, (long)pri);
835                     } else {
836                         newWeight = cw1c.nextWeight();
837                     }
838                 } else {
839                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
840                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
841                               (long)prevPri, (long)pri);
842                     } else {
843                         newWeight = cw1u.nextWeight();
844                     }
845                 }
846                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
847                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
848                           (long)prevPri, (long)newWeight, (long)pri);
849                 }
850             } else if(sec != prevSec) {
851                 uint32_t lowerLimit =
852                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
853                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
854                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
855                 } else {
856                     uint32_t newWeight = cw2.nextWeight();
857                     if(!(prevSec < newWeight && newWeight < sec)) {
858                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
859                               (long)lowerLimit, (long)newWeight, (long)sec);
860                     }
861                 }
862             } else if(ter != prevTer) {
863                 uint32_t lowerLimit =
864                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
865                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
866                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
867                 } else {
868                     uint32_t newWeight = cw3.nextWeight();
869                     if(!(prevTer < newWeight && newWeight < ter)) {
870                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
871                               (long)lowerLimit, (long)newWeight, (long)ter);
872                     }
873                 }
874             } else {
875                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
876             }
877         }
878         prevPri = pri;
879         prevSec = sec;
880         prevTer = ter;
881     }
882 }
883 
TestTailoredElements()884 void CollationTest::TestTailoredElements() {
885     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
886     const CollationData *root = CollationRoot::getData(errorCode);
887     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
888         return;
889     }
890     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
891 
892     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, nullptr, errorCode);
893     if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
894         return;
895     }
896     uhash_setKeyDeleter(prevLocales, uprv_free);
897     // TestRootElements() tests the root collator which does not have tailorings.
898     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
899     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
900     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
901 
902     UVector64 ces(errorCode);
903     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
904     U_ASSERT(locales.isValid());
905     const char *localeID = "root";
906     do {
907         Locale locale(localeID);
908         LocalPointer<StringEnumeration> types(
909                 Collator::getKeywordValuesForLocale("collation", locale, false, errorCode));
910         errorCode.assertSuccess();
911         const char *type;  // first: default type
912         while((type = types->next(nullptr, errorCode)) != nullptr) {
913             if(strncmp(type, "private-", 8) == 0) {
914                 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
915                         localeID, type);
916             }
917             Locale localeWithType(locale);
918             localeWithType.setKeywordValue("collation", type, errorCode);
919             errorCode.assertSuccess();
920             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
921             if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
922                                               localeWithType.getName())) {
923                 continue;
924             }
925             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
926             if(uhash_geti(prevLocales, actual.getName()) != 0) {
927                 continue;
928             }
929             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
930             errorCode.assertSuccess();
931             logln("TestTailoredElements(): requested %s -> actual %s",
932                   localeWithType.getName(), actual.getName());
933             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
934             if(rbc == nullptr) {
935                 continue;
936             }
937             // Note: It would be better to get tailored strings such that we can
938             // identify the prefix, and only get the CEs for the prefix+string,
939             // not also for the prefix.
940             // There is currently no API for that.
941             // It would help in an unusual case where a contraction starting in the prefix
942             // extends past its end, and we do not see the intended mapping.
943             // For example, for a mapping p|st, if there is also a contraction ps,
944             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
945             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
946             errorCode.assertSuccess();
947             UnicodeSetIterator iter(*tailored);
948             while(iter.next()) {
949                 const UnicodeString &s = iter.getString();
950                 ces.removeAllElements();
951                 rbc->internalGetCEs(s, ces, errorCode);
952                 errorCode.assertSuccess();
953                 for(int32_t i = 0; i < ces.size(); ++i) {
954                     int64_t ce = ces.elementAti(i);
955                     if(!isValidCE(rootElements, *root, ce)) {
956                         errln("invalid tailored CE %016llx at CE index %d from string:",
957                               (long long)ce, (int)i);
958                         infoln(prettify(s));
959                     }
960                 }
961             }
962         }
963     } while((localeID = locales->next(nullptr, errorCode)) != nullptr);
964     uhash_close(prevLocales);
965 }
966 
printSortKey(const uint8_t * p,int32_t length)967 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
968     UnicodeString s;
969     for(int32_t i = 0; i < length; ++i) {
970         if(i > 0) { s.append((char16_t)0x20); }
971         uint8_t b = p[i];
972         if(b == 0) {
973             s.append((char16_t)0x2e);  // period
974         } else if(b == 1) {
975             s.append((char16_t)0x7c);  // vertical bar
976         } else {
977             appendHex(b, 2, s);
978         }
979     }
980     return s;
981 }
982 
printCollationKey(const CollationKey & key)983 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
984     int32_t length;
985     const uint8_t *p = key.getByteArray(length);
986     return printSortKey(p, length);
987 }
988 
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)989 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
990     for(;;) {
991         int32_t lineLength;
992         const char16_t *line = ucbuf_readline(f, &lineLength, errorCode);
993         if(line == nullptr || errorCode.isFailure()) {
994             fileLine.remove();
995             return false;
996         }
997         ++fileLineNumber;
998         // Strip trailing CR/LF, comments, and spaces.
999         const char16_t *comment = u_memchr(line, 0x23, lineLength);  // '#'
1000         if(comment != nullptr) {
1001             lineLength = (int32_t)(comment - line);
1002         } else {
1003             while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
1004         }
1005         while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
1006         if(lineLength != 0) {
1007             fileLine.setTo(false, line, lineLength);
1008             return true;
1009         }
1010         // Empty line, continue.
1011     }
1012 }
1013 
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)1014 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
1015                                 UErrorCode &errorCode) {
1016     int32_t length = fileLine.length();
1017     int32_t i;
1018     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1019     int32_t pipeIndex = fileLine.indexOf((char16_t)0x7c, start, i - start);  // '|'
1020     if(pipeIndex >= 0) {
1021         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1022         if(prefix.isEmpty()) {
1023             errln("empty prefix on line %d", (int)fileLineNumber);
1024             infoln(fileLine);
1025             errorCode = U_PARSE_ERROR;
1026             return;
1027         }
1028         start = pipeIndex + 1;
1029     } else {
1030         prefix.remove();
1031     }
1032     s = fileLine.tempSubStringBetween(start, i).unescape();
1033     if(s.isEmpty()) {
1034         errln("empty string on line %d", (int)fileLineNumber);
1035         infoln(fileLine);
1036         errorCode = U_PARSE_ERROR;
1037         return;
1038     }
1039     start = i;
1040 }
1041 
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1042 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1043     Collation::Level relation;
1044     int32_t start;
1045     if(fileLine[0] == 0x3c) {  // <
1046         char16_t second = fileLine[1];
1047         start = 2;
1048         switch(second) {
1049         case 0x31:  // <1
1050             relation = Collation::PRIMARY_LEVEL;
1051             break;
1052         case 0x32:  // <2
1053             relation = Collation::SECONDARY_LEVEL;
1054             break;
1055         case 0x33:  // <3
1056             relation = Collation::TERTIARY_LEVEL;
1057             break;
1058         case 0x34:  // <4
1059             relation = Collation::QUATERNARY_LEVEL;
1060             break;
1061         case 0x63:  // <c
1062             relation = Collation::CASE_LEVEL;
1063             break;
1064         case 0x69:  // <i
1065             relation = Collation::IDENTICAL_LEVEL;
1066             break;
1067         default:  // just <
1068             relation = Collation::NO_LEVEL;
1069             start = 1;
1070             break;
1071         }
1072     } else if(fileLine[0] == 0x3d) {  // =
1073         relation = Collation::ZERO_LEVEL;
1074         start = 1;
1075     } else {
1076         start = 0;
1077     }
1078     if(start == 0 || !isSpace(fileLine[start])) {
1079         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1080         infoln(fileLine);
1081         errorCode.set(U_PARSE_ERROR);
1082         return Collation::NO_LEVEL;
1083     }
1084     start = skipSpaces(start);
1085     UnicodeString prefix;
1086     parseString(start, prefix, s, errorCode);
1087     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1088         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1089         infoln(fileLine);
1090         errorCode.set(U_PARSE_ERROR);
1091         return Collation::NO_LEVEL;
1092     }
1093     if(start < fileLine.length()) {
1094         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1095         infoln(fileLine);
1096         errorCode.set(U_PARSE_ERROR);
1097         return Collation::NO_LEVEL;
1098     }
1099     return relation;
1100 }
1101 
// Attribute names recognized by parseAndSetAttribute(),
// each paired with the UColAttribute that it selects.
static const struct {
    const char *name;
    UColAttribute attr;
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1115 
// Attribute value names recognized by parseAndSetAttribute(),
// each paired with the UColAttributeValue that it selects.
// The table is shared by all attributes; whether a value is legal for
// a particular attribute is left to Collator::setAttribute().
static const struct {
    const char *name;
    UColAttributeValue value;
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1133 
parseAndSetAttribute(IcuTestErrorCode & errorCode)1134 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1135     // Parse attributes even if the Collator could not be created,
1136     // in order to report syntax errors.
1137     int32_t start = skipSpaces(1);
1138     int32_t equalPos = fileLine.indexOf((char16_t)0x3d);
1139     if(equalPos < 0) {
1140         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1141             parseAndSetReorderCodes(start + 7, errorCode);
1142             return;
1143         }
1144         errln("missing '=' on line %d", (int)fileLineNumber);
1145         infoln(fileLine);
1146         errorCode.set(U_PARSE_ERROR);
1147         return;
1148     }
1149 
1150     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1151     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1152     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1153         UColReorderCode max;
1154         if(valueString == UNICODE_STRING("space", 5)) {
1155             max = UCOL_REORDER_CODE_SPACE;
1156         } else if(valueString == UNICODE_STRING("punct", 5)) {
1157             max = UCOL_REORDER_CODE_PUNCTUATION;
1158         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1159             max = UCOL_REORDER_CODE_SYMBOL;
1160         } else if(valueString == UNICODE_STRING("currency", 8)) {
1161             max = UCOL_REORDER_CODE_CURRENCY;
1162         } else {
1163             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1164             infoln(fileLine);
1165             errorCode.set(U_PARSE_ERROR);
1166             return;
1167         }
1168         if(coll != nullptr) {
1169             coll->setMaxVariable(max, errorCode);
1170             if(errorCode.isFailure()) {
1171                 errln("setMaxVariable() failed on line %d: %s",
1172                       (int)fileLineNumber, errorCode.errorName());
1173                 infoln(fileLine);
1174                 return;
1175             }
1176         }
1177         fileLine.remove();
1178         return;
1179     }
1180 
1181     UColAttribute attr;
1182     for(int32_t i = 0;; ++i) {
1183         if(i == UPRV_LENGTHOF(attributes)) {
1184             errln("invalid attribute name on line %d", (int)fileLineNumber);
1185             infoln(fileLine);
1186             errorCode.set(U_PARSE_ERROR);
1187             return;
1188         }
1189         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1190             attr = attributes[i].attr;
1191             break;
1192         }
1193     }
1194 
1195     UColAttributeValue value;
1196     for(int32_t i = 0;; ++i) {
1197         if(i == UPRV_LENGTHOF(attributeValues)) {
1198             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1199             infoln(fileLine);
1200             errorCode.set(U_PARSE_ERROR);
1201             return;
1202         }
1203         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1204             value = attributeValues[i].value;
1205             break;
1206         }
1207     }
1208 
1209     if(coll != nullptr) {
1210         coll->setAttribute(attr, value, errorCode);
1211         if(errorCode.isFailure()) {
1212             errln("illegal attribute=value combination on line %d: %s",
1213                   (int)fileLineNumber, errorCode.errorName());
1214             infoln(fileLine);
1215             return;
1216         }
1217     }
1218     fileLine.remove();
1219 }
1220 
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1221 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1222     UVector32 reorderCodes(errorCode);
1223     while(start < fileLine.length()) {
1224         start = skipSpaces(start);
1225         int32_t limit = start;
1226         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1227         CharString name;
1228         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1229         int32_t code = CollationRuleParser::getReorderCode(name.data());
1230         if(code < 0) {
1231             if(uprv_stricmp(name.data(), "default") == 0) {
1232                 code = UCOL_REORDER_CODE_DEFAULT;  // -1
1233             } else {
1234                 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1235                 infoln(fileLine);
1236                 errorCode.set(U_PARSE_ERROR);
1237                 return;
1238             }
1239         }
1240         reorderCodes.addElement(code, errorCode);
1241         start = limit;
1242     }
1243     if(coll != nullptr) {
1244         coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1245         if(errorCode.isFailure()) {
1246             errln("setReorderCodes() failed on line %d: %s",
1247                   (int)fileLineNumber, errorCode.errorName());
1248             infoln(fileLine);
1249             return;
1250         }
1251     }
1252     fileLine.remove();
1253 }
1254 
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1255 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1256     UnicodeString rules;
1257     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1258         rules.append(fileLine.unescape());
1259     }
1260     if(errorCode.isFailure()) { return; }
1261     logln(rules);
1262 
1263     UParseError parseError;
1264     UnicodeString reason;
1265     delete coll;
1266     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1267     if(coll == nullptr) {
1268         errln("unable to allocate a new collator");
1269         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1270         return;
1271     }
1272     if(errorCode.isFailure()) {
1273         dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1274         infoln(UnicodeString("  reason: ") + reason);
1275         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1276         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1277             infoln(UnicodeString("  snippet: ...") +
1278                 parseError.preContext + "(!)" + parseError.postContext + "...");
1279         }
1280         delete coll;
1281         coll = nullptr;
1282         errorCode.reset();
1283     } else {
1284         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1285                      UnicodeString(), reason);
1286     }
1287 }
1288 
setRootCollator(IcuTestErrorCode & errorCode)1289 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1290     if(errorCode.isFailure()) { return; }
1291     delete coll;
1292     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1293     if(errorCode.isFailure()) {
1294         dataerrln("unable to create a root collator");
1295         return;
1296     }
1297 }
1298 
setLocaleCollator(IcuTestErrorCode & errorCode)1299 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1300     if(errorCode.isFailure()) { return; }
1301     delete coll;
1302     coll = nullptr;
1303     int32_t at = fileLine.indexOf((char16_t)0x40, 9);  // @ is not invariant
1304     if(at >= 0) {
1305         fileLine.setCharAt(at, (char16_t)0x2a);  // *
1306     }
1307     CharString localeID;
1308     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1309     if(at >= 0) {
1310         localeID.data()[at - 9] = '@';
1311     }
1312     Locale locale(localeID.data());
1313     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1314         errln("invalid language tag on line %d", (int)fileLineNumber);
1315         infoln(fileLine);
1316         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1317         return;
1318     }
1319 
1320     logln("creating a collator for locale ID %s", locale.getName());
1321     coll = Collator::createInstance(locale, errorCode);
1322     if(errorCode.isFailure()) {
1323         dataerrln("unable to create a collator for locale %s on line %d",
1324                   locale.getName(), (int)fileLineNumber);
1325         infoln(fileLine);
1326         delete coll;
1327         coll = nullptr;
1328         errorCode.reset();
1329     }
1330 }
1331 
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1332 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1333     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return true; }
1334     // In some sequences with Tibetan composite vowel signs,
1335     // even if the string passes the FCD check,
1336     // those composites must be decomposed.
1337     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1338     int32_t index = 0;
1339     while((index = s.indexOf((char16_t)0xf71, index)) >= 0) {
1340         if(++index < s.length()) {
1341             char16_t c = s[index];
1342             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return true; }
1343         }
1344     }
1345     return false;
1346 }
1347 
getSortKeyParts(const char16_t * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1348 UBool CollationTest::getSortKeyParts(const char16_t *s, int32_t length,
1349                                      CharString &dest, int32_t partSize,
1350                                      IcuTestErrorCode &errorCode) {
1351     if(errorCode.isFailure()) { return false; }
1352     uint8_t part[32];
1353     U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1354     UCharIterator iter;
1355     uiter_setString(&iter, s, length);
1356     uint32_t state[2] = { 0, 0 };
1357     for(;;) {
1358         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1359         UBool done = partLength < partSize;
1360         if(done) {
1361             // At the end, append the next byte as well which should be 00.
1362             ++partLength;
1363         }
1364         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1365         if(done) {
1366             return errorCode.isSuccess();
1367         }
1368     }
1369 }
1370 
getCollationKey(const char * norm,const UnicodeString & line,const char16_t * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1371 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1372                                      const char16_t *s, int32_t length,
1373                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1374     if(errorCode.isFailure()) { return false; }
1375     coll->getCollationKey(s, length, key, errorCode);
1376     if(errorCode.isFailure()) {
1377         infoln(fileTestName);
1378         errln("Collator(%s).getCollationKey() failed: %s",
1379               norm, errorCode.errorName());
1380         infoln(line);
1381         return false;
1382     }
1383     int32_t keyLength;
1384     const uint8_t *keyBytes = key.getByteArray(keyLength);
1385     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1386         infoln(fileTestName);
1387         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1388               norm);
1389         infoln(line);
1390         infoln(printCollationKey(key));
1391         return false;
1392     }
1393 
1394     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1395     if(numLevels < UCOL_IDENTICAL) {
1396         ++numLevels;
1397     } else {
1398         numLevels = 5;
1399     }
1400     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1401         ++numLevels;
1402     }
1403     errorCode.assertSuccess();
1404     int32_t numLevelSeparators = 0;
1405     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1406         uint8_t b = keyBytes[i];
1407         if(b == 0) {
1408             infoln(fileTestName);
1409             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1410             infoln(line);
1411             infoln(printCollationKey(key));
1412             return false;
1413         }
1414         if(b == 1) { ++numLevelSeparators; }
1415     }
1416     if(numLevelSeparators != (numLevels - 1)) {
1417         infoln(fileTestName);
1418         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1419               norm, (int)numLevelSeparators, (int)numLevels);
1420         infoln(line);
1421         infoln(printCollationKey(key));
1422         return false;
1423     }
1424 
1425     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1426     static const int32_t partSizes[] = { 32, 3, 1 };
1427     for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1428         int32_t partSize = partSizes[psi];
1429         CharString parts;
1430         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1431             infoln(fileTestName);
1432             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1433                   norm, (int)partSize, errorCode.errorName());
1434             infoln(line);
1435             return false;
1436         }
1437         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1438             infoln(fileTestName);
1439             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1440                   norm, (int)partSize);
1441             infoln(line);
1442             infoln(printCollationKey(key));
1443             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1444             return false;
1445         }
1446     }
1447     return true;
1448 }
1449 
1450 /**
1451  * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1452  * Leaves key unchanged if s does not contain U+FFFE.
1453  * @return true if the key was successfully changed
1454  */
getMergedCollationKey(const char16_t * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1455 UBool CollationTest::getMergedCollationKey(const char16_t *s, int32_t length,
1456                                            CollationKey &key, IcuTestErrorCode &errorCode) {
1457     if(errorCode.isFailure()) { return false; }
1458     LocalMemory<uint8_t> mergedKey;
1459     int32_t mergedKeyLength = 0;
1460     int32_t mergedKeyCapacity = 0;
1461     int32_t sLength = (length >= 0) ? length : u_strlen(s);
1462     int32_t segmentStart = 0;
1463     for(int32_t i = 0;;) {
1464         if(i == sLength) {
1465             if(segmentStart == 0) {
1466                 // s does not contain any U+FFFE.
1467                 return false;
1468             }
1469         } else if(s[i] != 0xfffe) {
1470             ++i;
1471             continue;
1472         }
1473         // Get the sort key for another segment and merge it into mergedKey.
1474         CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1475         CollationKey key2;
1476         coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1477         int32_t key1Length, key2Length;
1478         const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1479         const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1480         uint8_t *dest;
1481         int32_t minCapacity = key1Length + key2Length;
1482         if(key1Length > 0) { --minCapacity; }
1483         if(minCapacity <= mergedKeyCapacity) {
1484             dest = mergedKey.getAlias();
1485         } else {
1486             if(minCapacity <= 200) {
1487                 mergedKeyCapacity = 200;
1488             } else if(minCapacity <= 2 * mergedKeyCapacity) {
1489                 mergedKeyCapacity *= 2;
1490             } else {
1491                 mergedKeyCapacity = minCapacity;
1492             }
1493             dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1494         }
1495         U_ASSERT(dest != nullptr || mergedKeyCapacity == 0);
1496         if(key1Length == 0) {
1497             // key2 is the sort key for the first segment.
1498             uprv_memcpy(dest, key2Bytes, key2Length);
1499             mergedKeyLength = key2Length;
1500         } else {
1501             mergedKeyLength =
1502                 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1503                                    dest, mergedKeyCapacity);
1504         }
1505         if(i == sLength) { break; }
1506         segmentStart = ++i;
1507     }
1508     key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1509     return true;
1510 }
1511 
1512 namespace {
1513 
1514 /**
1515  * Replaces unpaired surrogates with U+FFFD.
1516  * Returns s if no replacement was made, otherwise buffer.
1517  */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1518 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1519     int32_t i = 0;
1520     while(i < s.length()) {
1521         UChar32 c = s.char32At(i);
1522         if(U_IS_SURROGATE(c)) {
1523             if(buffer.length() < i) {
1524                 buffer.append(s, buffer.length(), i - buffer.length());
1525             }
1526             buffer.append((char16_t)0xfffd);
1527         }
1528         i += U16_LENGTH(c);
1529     }
1530     if(buffer.isEmpty()) {
1531         return s;
1532     }
1533     if(buffer.length() < i) {
1534         buffer.append(s, buffer.length(), i - buffer.length());
1535     }
1536     return buffer;
1537 }
1538 
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1539 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1540                            UCollationResult order, UBool collHasCaseLevel) {
1541     if(order == UCOL_EQUAL) {
1542         return Collation::NO_LEVEL;
1543     }
1544     int32_t prevKeyLength;
1545     const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1546     int32_t keyLength;
1547     const uint8_t *bytes = key.getByteArray(keyLength);
1548     int32_t level = Collation::PRIMARY_LEVEL;
1549     for(int32_t i = 0;; ++i) {
1550         uint8_t b = prevBytes[i];
1551         if(b != bytes[i]) { break; }
1552         if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1553             ++level;
1554             if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1555                 ++level;
1556             }
1557         }
1558     }
1559     return level;
1560 }
1561 
1562 }
1563 
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1564 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1565                                      const UnicodeString &prevString, const UnicodeString &s,
1566                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1567                                      IcuTestErrorCode &errorCode) {
1568     if(errorCode.isFailure()) { return false; }
1569 
1570     // Get the sort keys first, for error debug output.
1571     CollationKey prevKey;
1572     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1573                         prevKey, errorCode)) {
1574         return false;
1575     }
1576     CollationKey key;
1577     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return false; }
1578 
1579     UCollationResult order = coll->compare(prevString, s, errorCode);
1580     if(order != expectedOrder || errorCode.isFailure()) {
1581         infoln(fileTestName);
1582         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1583               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1584         infoln(prevFileLine);
1585         infoln(fileLine);
1586         infoln(printCollationKey(prevKey));
1587         infoln(printCollationKey(key));
1588         return false;
1589     }
1590     order = coll->compare(s, prevString, errorCode);
1591     if(order != -expectedOrder || errorCode.isFailure()) {
1592         infoln(fileTestName);
1593         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1594               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1595         infoln(prevFileLine);
1596         infoln(fileLine);
1597         infoln(printCollationKey(prevKey));
1598         infoln(printCollationKey(key));
1599         return false;
1600     }
1601     // Test NUL-termination if the strings do not contain NUL characters.
1602     UBool containNUL = prevString.indexOf((char16_t)0) >= 0 || s.indexOf((char16_t)0) >= 0;
1603     if(!containNUL) {
1604         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1605         if(order != expectedOrder || errorCode.isFailure()) {
1606             infoln(fileTestName);
1607             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1608                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1609             infoln(prevFileLine);
1610             infoln(fileLine);
1611             infoln(printCollationKey(prevKey));
1612             infoln(printCollationKey(key));
1613             return false;
1614         }
1615         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1616         if(order != -expectedOrder || errorCode.isFailure()) {
1617             infoln(fileTestName);
1618             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1619                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1620             infoln(prevFileLine);
1621             infoln(fileLine);
1622             infoln(printCollationKey(prevKey));
1623             infoln(printCollationKey(key));
1624             return false;
1625         }
1626     }
1627 
1628     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1629     // Unpaired surrogates cannot be converted to UTF-8.
1630     // Create valid UTF-16 strings if necessary, and use those for
1631     // both the expected compare() result and for the input to compare(UTF-8).
1632     UnicodeString prevBuffer, sBuffer;
1633     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1634     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1635     std::string prevUTF8, sUTF8;
1636     UnicodeString(prevValid).toUTF8String(prevUTF8);
1637     UnicodeString(sValid).toUTF8String(sUTF8);
1638     UCollationResult expectedUTF8Order;
1639     if(&prevValid == &prevString && &sValid == &s) {
1640         expectedUTF8Order = expectedOrder;
1641     } else {
1642         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1643     }
1644 
1645     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1646     if(order != expectedUTF8Order || errorCode.isFailure()) {
1647         infoln(fileTestName);
1648         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1649               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1650         infoln(prevFileLine);
1651         infoln(fileLine);
1652         infoln(printCollationKey(prevKey));
1653         infoln(printCollationKey(key));
1654         return false;
1655     }
1656     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1657     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1658         infoln(fileTestName);
1659         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1660               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1661         infoln(prevFileLine);
1662         infoln(fileLine);
1663         infoln(printCollationKey(prevKey));
1664         infoln(printCollationKey(key));
1665         return false;
1666     }
1667     // Test NUL-termination if the strings do not contain NUL characters.
1668     if(!containNUL) {
1669         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1670         if(order != expectedUTF8Order || errorCode.isFailure()) {
1671             infoln(fileTestName);
1672             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1673                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1674             infoln(prevFileLine);
1675             infoln(fileLine);
1676             infoln(printCollationKey(prevKey));
1677             infoln(printCollationKey(key));
1678             return false;
1679         }
1680         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1681         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1682             infoln(fileTestName);
1683             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1684                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1685             infoln(prevFileLine);
1686             infoln(fileLine);
1687             infoln(printCollationKey(prevKey));
1688             infoln(printCollationKey(key));
1689             return false;
1690         }
1691     }
1692 
1693     UCharIterator leftIter;
1694     UCharIterator rightIter;
1695     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1696     uiter_setString(&rightIter, s.getBuffer(), s.length());
1697     order = coll->compare(leftIter, rightIter, errorCode);
1698     if(order != expectedOrder || errorCode.isFailure()) {
1699         infoln(fileTestName);
1700         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1701               "wrong order: %d != %d (%s)",
1702               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1703         infoln(prevFileLine);
1704         infoln(fileLine);
1705         infoln(printCollationKey(prevKey));
1706         infoln(printCollationKey(key));
1707         return false;
1708     }
1709 
1710     order = prevKey.compareTo(key, errorCode);
1711     if(order != expectedOrder || errorCode.isFailure()) {
1712         infoln(fileTestName);
1713         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1714               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1715         infoln(prevFileLine);
1716         infoln(fileLine);
1717         infoln(printCollationKey(prevKey));
1718         infoln(printCollationKey(key));
1719         return false;
1720     }
1721     UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1722     int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1723     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1724         if(level != expectedLevel) {
1725             infoln(fileTestName);
1726             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1727                   (int)fileLineNumber, norm, order, level, expectedLevel);
1728             infoln(prevFileLine);
1729             infoln(fileLine);
1730             infoln(printCollationKey(prevKey));
1731             infoln(printCollationKey(key));
1732             return false;
1733         }
1734     }
1735 
1736     // If either string contains U+FFFE, then their sort keys must compare the same as
1737     // the merged sort keys of each string's between-FFFE segments.
1738     //
1739     // It is not required that
1740     //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1741     // only that those two methods yield the same order.
1742     //
1743     // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1744     if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1745                 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1746             errorCode.isFailure()) {
1747         order = prevKey.compareTo(key, errorCode);
1748         if(order != expectedOrder || errorCode.isFailure()) {
1749             infoln(fileTestName);
1750             errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1751                 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1752                 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1753             infoln(prevFileLine);
1754             infoln(fileLine);
1755             infoln(printCollationKey(prevKey));
1756             infoln(printCollationKey(key));
1757             return false;
1758         }
1759         int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1760         if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1761             if(mergedLevel != level) {
1762                 infoln(fileTestName);
1763                 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1764                     "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1765                     (int)fileLineNumber, norm, order, mergedLevel, level);
1766                 infoln(prevFileLine);
1767                 infoln(fileLine);
1768                 infoln(printCollationKey(prevKey));
1769                 infoln(printCollationKey(key));
1770                 return false;
1771             }
1772         }
1773     }
1774     return true;
1775 }
1776 
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1777 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1778     if(errorCode.isFailure()) { return; }
1779     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1780     UnicodeString prevString, s;
1781     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1782     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1783         // Parse the line even if it will be ignored (when we do not have a Collator)
1784         // in order to report syntax issues.
1785         Collation::Level relation = parseRelationAndString(s, errorCode);
1786         if(errorCode.isFailure()) {
1787             errorCode.reset();
1788             break;
1789         }
1790         if(coll == nullptr) {
1791             // We were unable to create the Collator but continue with tests.
1792             // Ignore test data for this Collator.
1793             // The next Collator creation might work.
1794             continue;
1795         }
1796         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1797         Collation::Level expectedLevel = relation;
1798         s.getTerminatedBuffer();  // Ensure NUL-termination.
1799         UBool isOk = true;
1800         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1801             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1802             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1803                                    expectedOrder, expectedLevel, errorCode);
1804         }
1805         if(isOk) {
1806             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1807             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1808                                    expectedOrder, expectedLevel, errorCode);
1809         }
1810         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1811             UnicodeString pn = nfd->normalize(prevString, errorCode);
1812             UnicodeString n = nfd->normalize(s, errorCode);
1813             pn.getTerminatedBuffer();
1814             n.getTerminatedBuffer();
1815             errorCode.assertSuccess();
1816             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1817                                    expectedOrder, expectedLevel, errorCode);
1818         }
1819         if(!isOk) {
1820             errorCode.reset();  // already reported
1821         }
1822         prevFileLine = fileLine;
1823         prevString = s;
1824         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1825     }
1826 }
1827 
TestDataDriven()1828 void CollationTest::TestDataDriven() {
1829     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1830 
1831     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1832     nfd = Normalizer2::getNFDInstance(errorCode);
1833     if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1834         return;
1835     }
1836 
1837     CharString path(getSourceTestData(errorCode), errorCode);
1838     path.appendPathPart("collationtest.txt", errorCode);
1839     const char *codePage = "UTF-8";
1840     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, true, false, errorCode));
1841     if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1842         return;
1843     }
1844     // Read a new line if necessary.
1845     // Sub-parsers leave the first line set that they do not handle.
1846     while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1847         if(!isSectionStarter(fileLine[0])) {
1848             errln("syntax error on line %d", (int)fileLineNumber);
1849             infoln(fileLine);
1850             return;
1851         }
1852         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1853             fileTestName = fileLine;
1854             logln(fileLine);
1855             fileLine.remove();
1856         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1857             setRootCollator(errorCode);
1858             fileLine.remove();
1859         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1860             setLocaleCollator(errorCode);
1861             fileLine.remove();
1862         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1863             buildTailoring(f.getAlias(), errorCode);
1864         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1865             parseAndSetAttribute(errorCode);
1866         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1867             checkCompareStrings(f.getAlias(), errorCode);
1868         } else {
1869             errln("syntax error on line %d", (int)fileLineNumber);
1870             infoln(fileLine);
1871             return;
1872         }
1873     }
1874 }
1875 
TestLongLocale()1876 void CollationTest::TestLongLocale() {
1877     IcuTestErrorCode errorCode(*this, "TestLongLocale");
1878     Locale longLocale("sie__1G_C_CEIE_CEZCX_CSUE_E_EIESZNI2_GB_LM_LMCSUE_LMCSX_"
1879                       "LVARIANT_MMCSIE_STEU_SU1GCEIE_SU6G_SU6SU6G_U_UBGE_UC_"
1880                       "UCEZCSI_UCIE_UZSIU_VARIANT_X@collation=bcs-ukvsz");
1881     LocalPointer<Collator> coll(Collator::createInstance(longLocale, errorCode));
1882 }
1883 
TestHang22414()1884 void CollationTest::TestHang22414() {
1885     IcuTestErrorCode errorCode(*this, "TestHang22414");
1886     const char* cases[] = {
1887         "en", // just make sure the code work.
1888         // The following hang before fixing ICU-22414
1889         "sr-Latn-TH-t-su-BM-u-co-private-unihan-x-lvariant-zxsuhc-vss-vjf-0-kn-"
1890         "uaktmtca-uce66u-vtcb1ik-ubsuuuk8-u3iucls-ue38925l-vau30i-u6uccttg-"
1891         "u1iuylik-u-ueein-zzzz",
1892     };
1893     for(int32_t i = 0; i < UPRV_LENGTHOF(cases); i ++) {
1894         icu::Locale l = icu::Locale::forLanguageTag(cases[i], errorCode);
1895         // Make sure the following won't hang.
1896         LocalPointer<Collator> coll(Collator::createInstance(l, errorCode));
1897         errorCode.reset();
1898     }
1899 }
TestBuilderContextsOverflow()1900 void CollationTest::TestBuilderContextsOverflow() {
1901     IcuTestErrorCode errorCode(*this, "TestBuilderContextsOverflow");
1902     // ICU-20715: Bad memory access in what looks like a bogus CharsTrie after
1903     // intermediate contextual-mappings data overflowed.
1904     // Caused by the CollationDataBuilder using some outdated values when building
1905     // contextual mappings with both prefix and contraction matching.
1906     // Fixed by resetting those outdated values before code looks at them.
1907     char16_t rules[] = {
1908         u'&', 0x10, 0x2ff, 0x503c, 0x4617,
1909         u'=', 0x80, 0x4f7f, 0xff, 0x3c3d, 0x1c4f, 0x3c3c,
1910         u'<', 0, 0, 0, 0, u'|', 0, 0, 0, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f, 0xff,
1911         u'=', 0, u'|', 0, 0, 0, 0, 0, 0, 0x1f00, 0xe30,
1912         0x3035, 0, 0, 0xd200, 0, 0x7f00, 0xff4f, 0x3d00, 0, 0x7c00,
1913         0, 0, 0, 0, 0, 0, 0, 0x301f, 0x350e, 0x30,
1914         0, 0, 0xd2, 0x7c00, 0, 0, 0, 0, 0, 0,
1915         0, 0x301f, 0x350e, 0x30, 0, 0, 0x52d2, 0x2f3c, 0x5552, 0x493c,
1916         0x1f10, 0x1f50, 0x300, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f,
1917         0xff,
1918         u'=', 0, u'|', 0, 0, 0, 0, 0x5000, 0x4617,
1919         u'=', 0x80, 0x4f7f, 0, 0, 0xd200, 0
1920     };
1921     UnicodeString s(false, rules, UPRV_LENGTHOF(rules));
1922     LocalPointer<Collator> coll(new RuleBasedCollator(s, errorCode), errorCode);
1923     if(errorCode.isSuccess()) {
1924         logln("successfully built the Collator");
1925     }
1926 }
1927 
1928 #endif  // !UCONFIG_NO_COLLATION
1929