// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationtest.cpp
*
* created on: 2012apr27
* created by: Markus W. Scherer
*/
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/stringpiece.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uniset.h"
29 #include "unicode/unistr.h"
30 #include "unicode/usetiter.h"
31 #include "unicode/ustring.h"
32 #include "charstr.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationfcd.h"
37 #include "collationiterator.h"
38 #include "collationroot.h"
39 #include "collationrootelements.h"
40 #include "collationruleparser.h"
41 #include "collationweights.h"
42 #include "cstring.h"
43 #include "intltest.h"
44 #include "normalizer2impl.h"
45 #include "ucbuf.h"
46 #include "uhash.h"
47 #include "uitercollationiterator.h"
48 #include "utf16collationiterator.h"
49 #include "utf8collationiterator.h"
50 #include "uvectr32.h"
51 #include "uvectr64.h"
52 #include "writesrc.h"
53
54 class CodePointIterator;
55
// TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57
58 class CollationTest : public IntlTest {
59 public:
CollationTest()60 CollationTest()
61 : fcd(nullptr), nfd(nullptr),
62 fileLineNumber(0),
63 coll(nullptr) {}
64
~CollationTest()65 ~CollationTest() {
66 delete coll;
67 }
68
69 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override;
70
71 void TestMinMax();
72 void TestImplicits();
73 void TestNulTerminated();
74 void TestIllegalUTF8();
75 void TestShortFCDData();
76 void TestFCD();
77 void TestCollationWeights();
78 void TestRootElements();
79 void TestTailoredElements();
80 void TestDataDriven();
81 void TestLongLocale();
82 void TestBuilderContextsOverflow();
83 void TestHang22414();
84
85 private:
86 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
87 void checkAllocWeights(CollationWeights &cw,
88 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
89 int32_t someLength, int32_t minCount);
90
91 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
92 static UnicodeString printCollationKey(const CollationKey &key);
93
94 // Helpers & fields for data-driven test.
isCROrLF(char16_t c)95 static UBool isCROrLF(char16_t c) { return c == 0xa || c == 0xd; }
isSpace(char16_t c)96 static UBool isSpace(char16_t c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(char16_t c)97 static UBool isSectionStarter(char16_t c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
skipSpaces(int32_t i)98 int32_t skipSpaces(int32_t i) {
99 while(isSpace(fileLine[i])) { ++i; }
100 return i;
101 }
102
103 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
104 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
105 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
106 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
107 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
108 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
109 void setRootCollator(IcuTestErrorCode &errorCode);
110 void setLocaleCollator(IcuTestErrorCode &errorCode);
111
112 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
113
114 UBool getSortKeyParts(const char16_t *s, int32_t length,
115 CharString &dest, int32_t partSize,
116 IcuTestErrorCode &errorCode);
117 UBool getCollationKey(const char *norm, const UnicodeString &line,
118 const char16_t *s, int32_t length,
119 CollationKey &key, IcuTestErrorCode &errorCode);
120 UBool getMergedCollationKey(const char16_t *s, int32_t length,
121 CollationKey &key, IcuTestErrorCode &errorCode);
122 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
123 const UnicodeString &prevString, const UnicodeString &s,
124 UCollationResult expectedOrder, Collation::Level expectedLevel,
125 IcuTestErrorCode &errorCode);
126 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
127
128 const Normalizer2 *fcd, *nfd;
129 UnicodeString fileLine;
130 int32_t fileLineNumber;
131 UnicodeString fileTestName;
132 Collator *coll;
133 };
134
createCollationTest()135 extern IntlTest *createCollationTest() {
136 return new CollationTest();
137 }
138
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)139 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
140 if(exec) {
141 logln("TestSuite CollationTest: ");
142 }
143 TESTCASE_AUTO_BEGIN;
144 TESTCASE_AUTO(TestMinMax);
145 TESTCASE_AUTO(TestImplicits);
146 TESTCASE_AUTO(TestNulTerminated);
147 TESTCASE_AUTO(TestIllegalUTF8);
148 TESTCASE_AUTO(TestShortFCDData);
149 TESTCASE_AUTO(TestFCD);
150 TESTCASE_AUTO(TestCollationWeights);
151 TESTCASE_AUTO(TestRootElements);
152 TESTCASE_AUTO(TestTailoredElements);
153 TESTCASE_AUTO(TestDataDriven);
154 TESTCASE_AUTO(TestLongLocale);
155 TESTCASE_AUTO(TestBuilderContextsOverflow);
156 TESTCASE_AUTO(TestHang22414);
157 TESTCASE_AUTO_END;
158 }
159
TestMinMax()160 void CollationTest::TestMinMax() {
161 IcuTestErrorCode errorCode(*this, "TestMinMax");
162
163 setRootCollator(errorCode);
164 if(errorCode.isFailure()) {
165 errorCode.reset();
166 return;
167 }
168 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
169 if(rbc == nullptr) {
170 errln("the root collator is not a RuleBasedCollator");
171 return;
172 }
173
174 static const char16_t s[2] = { 0xfffe, 0xffff };
175 UVector64 ces(errorCode);
176 rbc->internalGetCEs(UnicodeString(false, s, 2), ces, errorCode);
177 errorCode.assertSuccess();
178 if(ces.size() != 2) {
179 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
180 return;
181 }
182 int64_t ce = ces.elementAti(0);
183 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
184 if(ce != expected) {
185 errln("CE(U+fffe)=%04lx != 02..", (long)ce);
186 }
187
188 ce = ces.elementAti(1);
189 expected = Collation::makeCE(Collation::MAX_PRIMARY);
190 if(ce != expected) {
191 errln("CE(U+ffff)=%04lx != max..", (long)ce);
192 }
193 }
194
TestImplicits()195 void CollationTest::TestImplicits() {
196 IcuTestErrorCode errorCode(*this, "TestImplicits");
197
198 const CollationData *cd = CollationRoot::getData(errorCode);
199 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
200 return;
201 }
202
203 // Implicit primary weights should be assigned for the following sets,
204 // and sort in ascending order by set and then code point.
205 // See https://www.unicode.org/reports/tr10/#Implicit_Weights
206
207 // core Han Unified Ideographs
208 UnicodeSet coreHan("[\\p{unified_ideograph}&"
209 "[\\p{Block=CJK_Unified_Ideographs}"
210 "\\p{Block=CJK_Compatibility_Ideographs}]]",
211 errorCode);
212 // all other Unified Han ideographs
213 UnicodeSet otherHan("[\\p{unified ideograph}-"
214 "[\\p{Block=CJK_Unified_Ideographs}"
215 "\\p{Block=CJK_Compatibility_Ideographs}]]",
216 errorCode);
217 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
218 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
219
220 // Starting with CLDR 26/ICU 54, the root Han order may instead be
221 // the Unihan radical-stroke order.
222 // The tests should pass either way, so we only test the order of a small set of Han characters
223 // whose radical-stroke order is the same as their code point order.
224 //
225 // When the radical-stroke data (kRSUnicode) for one of these characters changes
226 // such that it no longer sorts in code point order,
227 // then we need to remove it from this set.
228 // (These changes are easiest to see in the change history of the Unicode Tools file
229 // unicodetools/data/ucd/dev/Unihan/kRSUnicode.txt.)
230 // For example, in Unicode 15.1, U+503B has a kRSUnicode value of 9.9
231 // while the neighboring characters still have 9.8. We remove the out-of-order U+503B.
232 //
233 // FYI: The Unicode Tools program GenerateUnihanCollators prints something like
234 // hanInCPOrder = [一-世丘-丫中-丼举-么乊-习乣-亏...鼢-齡齣-龏龑-龥]
235 // number of original-Unihan characters out of order: 318
236 UnicodeSet someHanInCPOrder(
237 u"[\u4E00-\u4E16\u4E18-\u4E2B\u4E2D-\u4E3C\u4E3E-\u4E48"
238 u"\u4E4A-\u4E60\u4E63-\u4E8F\u4E91-\u4F63\u4F65-\u503A\u503C-\u50F1\u50F3-\u50F6]",
239 errorCode);
240 UnicodeSet inOrder(someHanInCPOrder);
241 inOrder.addAll(unassigned).freeze();
242 if(errorCode.errIfFailureAndReset("UnicodeSet")) {
243 return;
244 }
245 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
246 const char *const setNames[] = { "core Han", "Han extensions", "unassigned" };
247 UChar32 prev = 0;
248 uint32_t prevPrimary = 0;
249 UTF16CollationIterator ci(cd, false, nullptr, nullptr, nullptr);
250 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
251 const char *setName = setNames[i];
252 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
253 while(iter->next()) {
254 UChar32 c = iter->getCodepoint();
255 UnicodeString s(c);
256 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
257 int64_t ce = ci.nextCE(errorCode);
258 int64_t ce2 = ci.nextCE(errorCode);
259 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
260 return;
261 }
262 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
263 errln("%s: CollationIterator.nextCE(U+%04lx) did not yield exactly one CE",
264 setName, (long)c);
265 continue;
266 }
267 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
268 errln("%s: CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
269 setName, (long)c, (long)(ce & 0xffffffff));
270 continue;
271 }
272 uint32_t primary = (uint32_t)(ce >> 32);
273 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
274 errln("%s: CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
275 setName, (long)c, (long)primary, (long)prev, (long)prevPrimary);
276 }
277 prev = c;
278 prevPrimary = primary;
279 }
280 }
281 }
282
TestNulTerminated()283 void CollationTest::TestNulTerminated() {
284 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
285 const CollationData *data = CollationRoot::getData(errorCode);
286 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
287 return;
288 }
289
290 static const char16_t s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
291
292 UTF16CollationIterator ci1(data, false, s, s, s + 2);
293 UTF16CollationIterator ci2(data, false, s + 2, s + 2, nullptr);
294 for(int32_t i = 0;; ++i) {
295 int64_t ce1 = ci1.nextCE(errorCode);
296 int64_t ce2 = ci2.nextCE(errorCode);
297 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
298 return;
299 }
300 if(ce1 != ce2) {
301 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
302 break;
303 }
304 if(ce1 == Collation::NO_CE) { break; }
305 }
306 }
307
TestIllegalUTF8()308 void CollationTest::TestIllegalUTF8() {
309 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
310
311 setRootCollator(errorCode);
312 if(errorCode.isFailure()) {
313 errorCode.reset();
314 return;
315 }
316 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
317
318 static const StringPiece strings[] = {
319 // string with U+FFFD == illegal byte sequence
320 u8"a\uFFFDz", "a\x80z", // trail byte
321 u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
322 u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
323 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
324 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
325 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
326 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
327 };
328
329 for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
330 StringPiece fffd(strings[i]);
331 StringPiece illegal(strings[i + 1]);
332 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
333 if(order != UCOL_EQUAL) {
334 errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
335 (int)i, order);
336 }
337 }
338 }
339
340 namespace {
341
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)342 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
343 for(UChar32 c = 0x10000; c < 0x110000;) {
344 UChar32 next = c + 0x400;
345 if(src.containsSome(c, next - 1)) {
346 dest.add(U16_LEAD(c));
347 }
348 c = next;
349 }
350 }
351
352 } // namespace
353
TestShortFCDData()354 void CollationTest::TestShortFCDData() {
355 // See CollationFCD class comments.
356 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
357 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
358 errorCode.assertSuccess();
359 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
360 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
361 UnicodeSet lccc; // actual
362 for(UChar32 c = 0; c <= 0xffff; ++c) {
363 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
364 }
365 UnicodeSet diff(expectedLccc);
366 diff.removeAll(lccc);
367 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
368 UnicodeString empty("[]");
369 UnicodeString diffString;
370 diff.toPattern(diffString, true);
371 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
372 diff = lccc;
373 diff.removeAll(expectedLccc);
374 diff.toPattern(diffString, true);
375 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, true);
376
377 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
378 if (errorCode.isSuccess()) {
379 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
380 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
381 UnicodeSet tccc; // actual
382 for(UChar32 c = 0; c <= 0xffff; ++c) {
383 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
384 }
385 diff = expectedTccc;
386 diff.removeAll(tccc);
387 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
388 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
389 diff = tccc;
390 diff.removeAll(expectedTccc);
391 diff.toPattern(diffString, true);
392 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
393 }
394 }
395
396 class CodePointIterator {
397 public:
CodePointIterator(const UChar32 * cp,int32_t length)398 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()399 void resetToStart() { pos = 0; }
next()400 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()401 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const402 int32_t getLength() const { return length; }
getIndex() const403 int getIndex() const { return (int)pos; }
404 private:
405 const UChar32 *cp;
406 int32_t length;
407 int32_t pos;
408 };
409
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)410 void CollationTest::checkFCD(const char *name,
411 CollationIterator &ci, CodePointIterator &cpi) {
412 IcuTestErrorCode errorCode(*this, "checkFCD");
413
414 // Iterate forward to the limit.
415 for(;;) {
416 UChar32 c1 = ci.nextCodePoint(errorCode);
417 UChar32 c2 = cpi.next();
418 if(c1 != c2) {
419 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
420 name, (long)c1, (long)c2, cpi.getIndex());
421 return;
422 }
423 if(c1 < 0) { break; }
424 }
425
426 // Iterate backward most of the way.
427 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
428 UChar32 c1 = ci.previousCodePoint(errorCode);
429 UChar32 c2 = cpi.previous();
430 if(c1 != c2) {
431 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
432 name, (long)c1, (long)c2, cpi.getIndex());
433 return;
434 }
435 }
436
437 // Forward again.
438 for(;;) {
439 UChar32 c1 = ci.nextCodePoint(errorCode);
440 UChar32 c2 = cpi.next();
441 if(c1 != c2) {
442 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
443 name, (long)c1, (long)c2, cpi.getIndex());
444 return;
445 }
446 if(c1 < 0) { break; }
447 }
448
449 // Iterate backward to the start.
450 for(;;) {
451 UChar32 c1 = ci.previousCodePoint(errorCode);
452 UChar32 c2 = cpi.previous();
453 if(c1 != c2) {
454 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
455 name, (long)c1, (long)c2, cpi.getIndex());
456 return;
457 }
458 if(c1 < 0) { break; }
459 }
460 }
461
TestFCD()462 void CollationTest::TestFCD() {
463 IcuTestErrorCode errorCode(*this, "TestFCD");
464 const CollationData *data = CollationRoot::getData(errorCode);
465 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
466 return;
467 }
468
469 // Input string, not FCD, NUL-terminated.
470 static const char16_t s[] = {
471 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
472 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
473 0x327, 0x308, // ccc=202, 230
474 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
475 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
476 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
477 0xac01,
478 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
479 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
480 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
481 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
482 0x4e00, 0xf81,
483 0
484 };
485 // Expected code points.
486 static const UChar32 cp[] = {
487 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
488 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
489 0x1D15F, 0x1D16D,
490 0xac01,
491 0x63, 0x327, 0x1D165, 0x1D16D,
492 0x61,
493 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
494 0x4e00, 0xf71, 0xf80
495 };
496
497 FCDUTF16CollationIterator u16ci(data, false, s, s, nullptr);
498 if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
499 return;
500 }
501 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
502 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
503
504 cpi.resetToStart();
505 std::string utf8;
506 UnicodeString(s).toUTF8String(utf8);
507 FCDUTF8CollationIterator u8ci(data, false,
508 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
509 if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
510 return;
511 }
512 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
513
514 cpi.resetToStart();
515 UCharIterator iter;
516 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the terminating NUL
517 FCDUIterCollationIterator uici(data, false, iter, 0);
518 if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
519 return;
520 }
521 checkFCD("FCDUIterCollationIterator", uici, cpi);
522 }
523
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)524 void CollationTest::checkAllocWeights(CollationWeights &cw,
525 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
526 int32_t someLength, int32_t minCount) {
527 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
528 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = false",
529 (long)lowerLimit, (long)upperLimit, (long)n);
530 return;
531 }
532 uint32_t previous = lowerLimit;
533 int32_t count = 0; // number of weights that have someLength
534 for(int32_t i = 0; i < n; ++i) {
535 uint32_t w = cw.nextWeight();
536 if(w == 0xffffffff) {
537 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
538 "returns only %ld weights",
539 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
540 return;
541 }
542 if(!(previous < w && w < upperLimit)) {
543 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
544 "number %ld -> %lx not between %lx and %lx",
545 (long)lowerLimit, (long)upperLimit, (long)n,
546 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
547 return;
548 }
549 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
550 }
551 if(count < minCount) {
552 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
553 "returns only %ld < %ld weights of length %d",
554 (long)lowerLimit, (long)upperLimit, (long)n,
555 (long)count, (long)minCount, (int)someLength);
556 }
557 }
558
TestCollationWeights()559 void CollationTest::TestCollationWeights() {
560 CollationWeights cw;
561
562 // Non-compressible primaries use 254 second bytes 02..FF.
563 logln("CollationWeights.initForPrimary(non-compressible)");
564 cw.initForPrimary(false);
565 // Expect 1 weight 11 and 254 weights 12xx.
566 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
567 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
568 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
569 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
570 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
571 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
572 // Expect 254^2=64516 three-byte weights.
573 // During computation, there should be 3 three-byte ranges
574 // 10ffff, 11xxxx, 120202.
575 // The middle one should be split 64515:1,
576 // and the newly-split-off range and the last ranged lengthened.
577 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
578 // Expect weights 1102 & 1103.
579 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
580 // Expect weights 102102 & 102103.
581 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
582
583 // Compressible primaries use 251 second bytes 04..FE.
584 logln("CollationWeights.initForPrimary(compressible)");
585 cw.initForPrimary(true);
586 // Expect 1 weight 11 and 251 weights 12xx.
587 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
588 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
589 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
590 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
591 // Expect weights 1104 & 1105.
592 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
593 // Expect weights 102102 & 102103.
594 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
595
596 // Secondary and tertiary weights use only bytes 3 & 4.
597 logln("CollationWeights.initForSecondary()");
598 cw.initForSecondary();
599 // Expect weights fbxx and all four fc..ff.
600 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
601
602 logln("CollationWeights.initForTertiary()");
603 cw.initForTertiary();
604 // Expect weights 3dxx and both 3e & 3f.
605 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
606 }
607
608 namespace {
609
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)610 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
611 uint32_t p, uint32_t s, uint32_t ctq) {
612 uint32_t p1 = p >> 24;
613 uint32_t p2 = (p >> 16) & 0xff;
614 uint32_t p3 = (p >> 8) & 0xff;
615 uint32_t p4 = p & 0xff;
616 uint32_t s1 = s >> 8;
617 uint32_t s2 = s & 0xff;
618 // ctq = Case, Tertiary, Quaternary
619 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
620 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
621 uint32_t t1 = t >> 8;
622 uint32_t t2 = t & 0xff;
623 uint32_t q = ctq & Collation::QUATERNARY_MASK;
624 // No leading zero bytes.
625 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
626 return false;
627 }
628 // No intermediate zero bytes.
629 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
630 return false;
631 }
632 if(p2 != 0 && p3 == 0 && p4 != 0) {
633 return false;
634 }
635 // Minimum & maximum lead bytes.
636 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
637 s1 == Collation::LEVEL_SEPARATOR_BYTE ||
638 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
639 return false;
640 }
641 if(c > 2) {
642 return false;
643 }
644 // The valid byte range for the second primary byte depends on compressibility.
645 if(p2 != 0) {
646 if(data.isCompressibleLeadByte(p1)) {
647 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
648 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
649 return false;
650 }
651 } else {
652 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
653 return false;
654 }
655 }
656 }
657 // Other bytes just need to avoid the level separator.
658 // Trailing zeros are ok.
659 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
660 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
661 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
662 return false;
663 }
664 // Well-formed CEs.
665 if(p == 0) {
666 if(s == 0) {
667 if(t == 0) {
668 // Completely ignorable CE.
669 // Quaternary CEs are not supported.
670 if(c != 0 || q != 0) {
671 return false;
672 }
673 } else {
674 // Tertiary CE.
675 if(t < re.getTertiaryBoundary() || c != 2) {
676 return false;
677 }
678 }
679 } else {
680 // Secondary CE.
681 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
682 return false;
683 }
684 }
685 } else {
686 // Primary CE.
687 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
688 s >= re.getSecondaryBoundary()) {
689 return false;
690 }
691 if(t == 0 || t >= re.getTertiaryBoundary()) {
692 return false;
693 }
694 }
695 return true;
696 }
697
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)698 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
699 uint32_t p = (uint32_t)(ce >> 32);
700 uint32_t secTer = (uint32_t)ce;
701 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
702 }
703
704 class RootElementsIterator {
705 public:
RootElementsIterator(const CollationData & root)706 RootElementsIterator(const CollationData &root)
707 : data(root),
708 elements(root.rootElements), length(root.rootElementsLength),
709 pri(0), secTer(0),
710 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
711
next()712 UBool next() {
713 if(index >= length) { return false; }
714 uint32_t p = elements[index];
715 if(p == CollationRootElements::PRIMARY_SENTINEL) { return false; }
716 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
717 ++index;
718 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
719 return true;
720 }
721 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
722 // End of a range, enumerate the primaries in the range.
723 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
724 p &= 0xffffff00;
725 if(pri == p) {
726 // Finished the range, return the next CE after it.
727 ++index;
728 return next();
729 }
730 U_ASSERT(pri < p);
731 // Return the next primary in this range.
732 UBool isCompressible = data.isCompressiblePrimary(pri);
733 if((pri & 0xffff) == 0) {
734 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
735 } else {
736 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
737 }
738 return true;
739 }
740 // Simple primary CE.
741 ++index;
742 pri = p;
743 // Does this have an explicit below-common sec/ter unit,
744 // or does it imply a common one?
745 if(index == length) {
746 secTer = Collation::COMMON_SEC_AND_TER_CE;
747 } else {
748 secTer = elements[index];
749 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
750 // No sec/ter delta.
751 secTer = Collation::COMMON_SEC_AND_TER_CE;
752 } else {
753 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
754 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
755 // Implied sec/ter.
756 secTer = Collation::COMMON_SEC_AND_TER_CE;
757 } else {
758 // Explicit sec/ter below common/common.
759 ++index;
760 }
761 }
762 }
763 return true;
764 }
765
getPrimary() const766 uint32_t getPrimary() const { return pri; }
getSecTer() const767 uint32_t getSecTer() const { return secTer; }
768
769 private:
770 const CollationData &data;
771 const uint32_t *elements;
772 int32_t length;
773
774 uint32_t pri;
775 uint32_t secTer;
776 int32_t index;
777 };
778
779 } // namespace
780
TestRootElements()781 void CollationTest::TestRootElements() {
782 IcuTestErrorCode errorCode(*this, "TestRootElements");
783 const CollationData *root = CollationRoot::getData(errorCode);
784 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
785 return;
786 }
787 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
788 RootElementsIterator iter(*root);
789
790 // We check each root CE for validity,
791 // and we also verify that there is a tailoring gap between each two CEs.
792 CollationWeights cw1c; // compressible primary weights
793 CollationWeights cw1u; // uncompressible primary weights
794 CollationWeights cw2;
795 CollationWeights cw3;
796
797 cw1c.initForPrimary(true);
798 cw1u.initForPrimary(false);
799 cw2.initForSecondary();
800 cw3.initForTertiary();
801
802 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
803 // nor the special merge-separator CE for U+FFFE.
804 uint32_t prevPri = 0;
805 uint32_t prevSec = 0;
806 uint32_t prevTer = 0;
807 while(iter.next()) {
808 uint32_t pri = iter.getPrimary();
809 uint32_t secTer = iter.getSecTer();
810 // CollationRootElements CEs must have 0 case and quaternary bits.
811 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
812 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
813 (long)pri, (long)secTer);
814 }
815 uint32_t sec = secTer >> 16;
816 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
817 uint32_t ctq = ter;
818 if(pri == 0 && sec == 0 && ter != 0) {
819 // Tertiary CEs must have uppercase bits,
820 // but they are not stored in the CollationRootElements.
821 ctq |= 0x8000;
822 }
823 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
824 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
825 } else {
826 if(pri != prevPri) {
827 uint32_t newWeight = 0;
828 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
829 // There is currently no tailoring gap after primary ignorables,
830 // and we forbid tailoring after U+FFFD and U+FFFF.
831 } else if(root->isCompressiblePrimary(prevPri)) {
832 if(!cw1c.allocWeights(prevPri, pri, 1)) {
833 errln("no primary/compressible tailoring gap between %08lx and %08lx",
834 (long)prevPri, (long)pri);
835 } else {
836 newWeight = cw1c.nextWeight();
837 }
838 } else {
839 if(!cw1u.allocWeights(prevPri, pri, 1)) {
840 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
841 (long)prevPri, (long)pri);
842 } else {
843 newWeight = cw1u.nextWeight();
844 }
845 }
846 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
847 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
848 (long)prevPri, (long)newWeight, (long)pri);
849 }
850 } else if(sec != prevSec) {
851 uint32_t lowerLimit =
852 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
853 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
854 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
855 } else {
856 uint32_t newWeight = cw2.nextWeight();
857 if(!(prevSec < newWeight && newWeight < sec)) {
858 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
859 (long)lowerLimit, (long)newWeight, (long)sec);
860 }
861 }
862 } else if(ter != prevTer) {
863 uint32_t lowerLimit =
864 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
865 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
866 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
867 } else {
868 uint32_t newWeight = cw3.nextWeight();
869 if(!(prevTer < newWeight && newWeight < ter)) {
870 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
871 (long)lowerLimit, (long)newWeight, (long)ter);
872 }
873 }
874 } else {
875 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
876 }
877 }
878 prevPri = pri;
879 prevSec = sec;
880 prevTer = ter;
881 }
882 }
883
TestTailoredElements()884 void CollationTest::TestTailoredElements() {
885 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
886 const CollationData *root = CollationRoot::getData(errorCode);
887 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
888 return;
889 }
890 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
891
892 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, nullptr, errorCode);
893 if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
894 return;
895 }
896 uhash_setKeyDeleter(prevLocales, uprv_free);
897 // TestRootElements() tests the root collator which does not have tailorings.
898 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
899 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
900 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
901
902 UVector64 ces(errorCode);
903 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
904 U_ASSERT(locales.isValid());
905 const char *localeID = "root";
906 do {
907 Locale locale(localeID);
908 LocalPointer<StringEnumeration> types(
909 Collator::getKeywordValuesForLocale("collation", locale, false, errorCode));
910 errorCode.assertSuccess();
911 const char *type; // first: default type
912 while((type = types->next(nullptr, errorCode)) != nullptr) {
913 if(strncmp(type, "private-", 8) == 0) {
914 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
915 localeID, type);
916 }
917 Locale localeWithType(locale);
918 localeWithType.setKeywordValue("collation", type, errorCode);
919 errorCode.assertSuccess();
920 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
921 if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
922 localeWithType.getName())) {
923 continue;
924 }
925 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
926 if(uhash_geti(prevLocales, actual.getName()) != 0) {
927 continue;
928 }
929 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
930 errorCode.assertSuccess();
931 logln("TestTailoredElements(): requested %s -> actual %s",
932 localeWithType.getName(), actual.getName());
933 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
934 if(rbc == nullptr) {
935 continue;
936 }
937 // Note: It would be better to get tailored strings such that we can
938 // identify the prefix, and only get the CEs for the prefix+string,
939 // not also for the prefix.
940 // There is currently no API for that.
941 // It would help in an unusual case where a contraction starting in the prefix
942 // extends past its end, and we do not see the intended mapping.
943 // For example, for a mapping p|st, if there is also a contraction ps,
944 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
945 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
946 errorCode.assertSuccess();
947 UnicodeSetIterator iter(*tailored);
948 while(iter.next()) {
949 const UnicodeString &s = iter.getString();
950 ces.removeAllElements();
951 rbc->internalGetCEs(s, ces, errorCode);
952 errorCode.assertSuccess();
953 for(int32_t i = 0; i < ces.size(); ++i) {
954 int64_t ce = ces.elementAti(i);
955 if(!isValidCE(rootElements, *root, ce)) {
956 errln("invalid tailored CE %016llx at CE index %d from string:",
957 (long long)ce, (int)i);
958 infoln(prettify(s));
959 }
960 }
961 }
962 }
963 } while((localeID = locales->next(nullptr, errorCode)) != nullptr);
964 uhash_close(prevLocales);
965 }
966
printSortKey(const uint8_t * p,int32_t length)967 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
968 UnicodeString s;
969 for(int32_t i = 0; i < length; ++i) {
970 if(i > 0) { s.append((char16_t)0x20); }
971 uint8_t b = p[i];
972 if(b == 0) {
973 s.append((char16_t)0x2e); // period
974 } else if(b == 1) {
975 s.append((char16_t)0x7c); // vertical bar
976 } else {
977 appendHex(b, 2, s);
978 }
979 }
980 return s;
981 }
982
printCollationKey(const CollationKey & key)983 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
984 int32_t length;
985 const uint8_t *p = key.getByteArray(length);
986 return printSortKey(p, length);
987 }
988
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)989 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
990 for(;;) {
991 int32_t lineLength;
992 const char16_t *line = ucbuf_readline(f, &lineLength, errorCode);
993 if(line == nullptr || errorCode.isFailure()) {
994 fileLine.remove();
995 return false;
996 }
997 ++fileLineNumber;
998 // Strip trailing CR/LF, comments, and spaces.
999 const char16_t *comment = u_memchr(line, 0x23, lineLength); // '#'
1000 if(comment != nullptr) {
1001 lineLength = (int32_t)(comment - line);
1002 } else {
1003 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
1004 }
1005 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
1006 if(lineLength != 0) {
1007 fileLine.setTo(false, line, lineLength);
1008 return true;
1009 }
1010 // Empty line, continue.
1011 }
1012 }
1013
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)1014 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
1015 UErrorCode &errorCode) {
1016 int32_t length = fileLine.length();
1017 int32_t i;
1018 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1019 int32_t pipeIndex = fileLine.indexOf((char16_t)0x7c, start, i - start); // '|'
1020 if(pipeIndex >= 0) {
1021 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1022 if(prefix.isEmpty()) {
1023 errln("empty prefix on line %d", (int)fileLineNumber);
1024 infoln(fileLine);
1025 errorCode = U_PARSE_ERROR;
1026 return;
1027 }
1028 start = pipeIndex + 1;
1029 } else {
1030 prefix.remove();
1031 }
1032 s = fileLine.tempSubStringBetween(start, i).unescape();
1033 if(s.isEmpty()) {
1034 errln("empty string on line %d", (int)fileLineNumber);
1035 infoln(fileLine);
1036 errorCode = U_PARSE_ERROR;
1037 return;
1038 }
1039 start = i;
1040 }
1041
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1042 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1043 Collation::Level relation;
1044 int32_t start;
1045 if(fileLine[0] == 0x3c) { // <
1046 char16_t second = fileLine[1];
1047 start = 2;
1048 switch(second) {
1049 case 0x31: // <1
1050 relation = Collation::PRIMARY_LEVEL;
1051 break;
1052 case 0x32: // <2
1053 relation = Collation::SECONDARY_LEVEL;
1054 break;
1055 case 0x33: // <3
1056 relation = Collation::TERTIARY_LEVEL;
1057 break;
1058 case 0x34: // <4
1059 relation = Collation::QUATERNARY_LEVEL;
1060 break;
1061 case 0x63: // <c
1062 relation = Collation::CASE_LEVEL;
1063 break;
1064 case 0x69: // <i
1065 relation = Collation::IDENTICAL_LEVEL;
1066 break;
1067 default: // just <
1068 relation = Collation::NO_LEVEL;
1069 start = 1;
1070 break;
1071 }
1072 } else if(fileLine[0] == 0x3d) { // =
1073 relation = Collation::ZERO_LEVEL;
1074 start = 1;
1075 } else {
1076 start = 0;
1077 }
1078 if(start == 0 || !isSpace(fileLine[start])) {
1079 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1080 infoln(fileLine);
1081 errorCode.set(U_PARSE_ERROR);
1082 return Collation::NO_LEVEL;
1083 }
1084 start = skipSpaces(start);
1085 UnicodeString prefix;
1086 parseString(start, prefix, s, errorCode);
1087 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1088 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1089 infoln(fileLine);
1090 errorCode.set(U_PARSE_ERROR);
1091 return Collation::NO_LEVEL;
1092 }
1093 if(start < fileLine.length()) {
1094 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1095 infoln(fileLine);
1096 errorCode.set(U_PARSE_ERROR);
1097 return Collation::NO_LEVEL;
1098 }
1099 return relation;
1100 }
1101
// Attribute names recognized by parseAndSetAttribute() in the text before '=',
// each paired with the UColAttribute that is passed to Collator::setAttribute().
// ("maxVariable" and "reorder" are handled separately in parseAndSetAttribute().)
static const struct {
    const char *name;  // attribute name as written in the test data
    UColAttribute attr;  // the attribute selected by that name
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1115
// Attribute value names recognized by parseAndSetAttribute() in the text after '=',
// each paired with the UColAttributeValue that is passed to Collator::setAttribute().
// The table is not specific to one attribute: whether a name/value combination
// is legal is decided by setAttribute() itself.
static const struct {
    const char *name;  // value name as written in the test data
    UColAttributeValue value;  // the attribute value selected by that name
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1133
parseAndSetAttribute(IcuTestErrorCode & errorCode)1134 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1135 // Parse attributes even if the Collator could not be created,
1136 // in order to report syntax errors.
1137 int32_t start = skipSpaces(1);
1138 int32_t equalPos = fileLine.indexOf((char16_t)0x3d);
1139 if(equalPos < 0) {
1140 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1141 parseAndSetReorderCodes(start + 7, errorCode);
1142 return;
1143 }
1144 errln("missing '=' on line %d", (int)fileLineNumber);
1145 infoln(fileLine);
1146 errorCode.set(U_PARSE_ERROR);
1147 return;
1148 }
1149
1150 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1151 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1152 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1153 UColReorderCode max;
1154 if(valueString == UNICODE_STRING("space", 5)) {
1155 max = UCOL_REORDER_CODE_SPACE;
1156 } else if(valueString == UNICODE_STRING("punct", 5)) {
1157 max = UCOL_REORDER_CODE_PUNCTUATION;
1158 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1159 max = UCOL_REORDER_CODE_SYMBOL;
1160 } else if(valueString == UNICODE_STRING("currency", 8)) {
1161 max = UCOL_REORDER_CODE_CURRENCY;
1162 } else {
1163 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1164 infoln(fileLine);
1165 errorCode.set(U_PARSE_ERROR);
1166 return;
1167 }
1168 if(coll != nullptr) {
1169 coll->setMaxVariable(max, errorCode);
1170 if(errorCode.isFailure()) {
1171 errln("setMaxVariable() failed on line %d: %s",
1172 (int)fileLineNumber, errorCode.errorName());
1173 infoln(fileLine);
1174 return;
1175 }
1176 }
1177 fileLine.remove();
1178 return;
1179 }
1180
1181 UColAttribute attr;
1182 for(int32_t i = 0;; ++i) {
1183 if(i == UPRV_LENGTHOF(attributes)) {
1184 errln("invalid attribute name on line %d", (int)fileLineNumber);
1185 infoln(fileLine);
1186 errorCode.set(U_PARSE_ERROR);
1187 return;
1188 }
1189 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1190 attr = attributes[i].attr;
1191 break;
1192 }
1193 }
1194
1195 UColAttributeValue value;
1196 for(int32_t i = 0;; ++i) {
1197 if(i == UPRV_LENGTHOF(attributeValues)) {
1198 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1199 infoln(fileLine);
1200 errorCode.set(U_PARSE_ERROR);
1201 return;
1202 }
1203 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1204 value = attributeValues[i].value;
1205 break;
1206 }
1207 }
1208
1209 if(coll != nullptr) {
1210 coll->setAttribute(attr, value, errorCode);
1211 if(errorCode.isFailure()) {
1212 errln("illegal attribute=value combination on line %d: %s",
1213 (int)fileLineNumber, errorCode.errorName());
1214 infoln(fileLine);
1215 return;
1216 }
1217 }
1218 fileLine.remove();
1219 }
1220
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1221 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1222 UVector32 reorderCodes(errorCode);
1223 while(start < fileLine.length()) {
1224 start = skipSpaces(start);
1225 int32_t limit = start;
1226 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1227 CharString name;
1228 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1229 int32_t code = CollationRuleParser::getReorderCode(name.data());
1230 if(code < 0) {
1231 if(uprv_stricmp(name.data(), "default") == 0) {
1232 code = UCOL_REORDER_CODE_DEFAULT; // -1
1233 } else {
1234 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1235 infoln(fileLine);
1236 errorCode.set(U_PARSE_ERROR);
1237 return;
1238 }
1239 }
1240 reorderCodes.addElement(code, errorCode);
1241 start = limit;
1242 }
1243 if(coll != nullptr) {
1244 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1245 if(errorCode.isFailure()) {
1246 errln("setReorderCodes() failed on line %d: %s",
1247 (int)fileLineNumber, errorCode.errorName());
1248 infoln(fileLine);
1249 return;
1250 }
1251 }
1252 fileLine.remove();
1253 }
1254
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1255 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1256 UnicodeString rules;
1257 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1258 rules.append(fileLine.unescape());
1259 }
1260 if(errorCode.isFailure()) { return; }
1261 logln(rules);
1262
1263 UParseError parseError;
1264 UnicodeString reason;
1265 delete coll;
1266 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1267 if(coll == nullptr) {
1268 errln("unable to allocate a new collator");
1269 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1270 return;
1271 }
1272 if(errorCode.isFailure()) {
1273 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1274 infoln(UnicodeString(" reason: ") + reason);
1275 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1276 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1277 infoln(UnicodeString(" snippet: ...") +
1278 parseError.preContext + "(!)" + parseError.postContext + "...");
1279 }
1280 delete coll;
1281 coll = nullptr;
1282 errorCode.reset();
1283 } else {
1284 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1285 UnicodeString(), reason);
1286 }
1287 }
1288
setRootCollator(IcuTestErrorCode & errorCode)1289 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1290 if(errorCode.isFailure()) { return; }
1291 delete coll;
1292 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1293 if(errorCode.isFailure()) {
1294 dataerrln("unable to create a root collator");
1295 return;
1296 }
1297 }
1298
setLocaleCollator(IcuTestErrorCode & errorCode)1299 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1300 if(errorCode.isFailure()) { return; }
1301 delete coll;
1302 coll = nullptr;
1303 int32_t at = fileLine.indexOf((char16_t)0x40, 9); // @ is not invariant
1304 if(at >= 0) {
1305 fileLine.setCharAt(at, (char16_t)0x2a); // *
1306 }
1307 CharString localeID;
1308 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1309 if(at >= 0) {
1310 localeID.data()[at - 9] = '@';
1311 }
1312 Locale locale(localeID.data());
1313 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1314 errln("invalid language tag on line %d", (int)fileLineNumber);
1315 infoln(fileLine);
1316 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1317 return;
1318 }
1319
1320 logln("creating a collator for locale ID %s", locale.getName());
1321 coll = Collator::createInstance(locale, errorCode);
1322 if(errorCode.isFailure()) {
1323 dataerrln("unable to create a collator for locale %s on line %d",
1324 locale.getName(), (int)fileLineNumber);
1325 infoln(fileLine);
1326 delete coll;
1327 coll = nullptr;
1328 errorCode.reset();
1329 }
1330 }
1331
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1332 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1333 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return true; }
1334 // In some sequences with Tibetan composite vowel signs,
1335 // even if the string passes the FCD check,
1336 // those composites must be decomposed.
1337 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1338 int32_t index = 0;
1339 while((index = s.indexOf((char16_t)0xf71, index)) >= 0) {
1340 if(++index < s.length()) {
1341 char16_t c = s[index];
1342 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return true; }
1343 }
1344 }
1345 return false;
1346 }
1347
getSortKeyParts(const char16_t * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1348 UBool CollationTest::getSortKeyParts(const char16_t *s, int32_t length,
1349 CharString &dest, int32_t partSize,
1350 IcuTestErrorCode &errorCode) {
1351 if(errorCode.isFailure()) { return false; }
1352 uint8_t part[32];
1353 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1354 UCharIterator iter;
1355 uiter_setString(&iter, s, length);
1356 uint32_t state[2] = { 0, 0 };
1357 for(;;) {
1358 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1359 UBool done = partLength < partSize;
1360 if(done) {
1361 // At the end, append the next byte as well which should be 00.
1362 ++partLength;
1363 }
1364 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1365 if(done) {
1366 return errorCode.isSuccess();
1367 }
1368 }
1369 }
1370
getCollationKey(const char * norm,const UnicodeString & line,const char16_t * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1371 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1372 const char16_t *s, int32_t length,
1373 CollationKey &key, IcuTestErrorCode &errorCode) {
1374 if(errorCode.isFailure()) { return false; }
1375 coll->getCollationKey(s, length, key, errorCode);
1376 if(errorCode.isFailure()) {
1377 infoln(fileTestName);
1378 errln("Collator(%s).getCollationKey() failed: %s",
1379 norm, errorCode.errorName());
1380 infoln(line);
1381 return false;
1382 }
1383 int32_t keyLength;
1384 const uint8_t *keyBytes = key.getByteArray(keyLength);
1385 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1386 infoln(fileTestName);
1387 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1388 norm);
1389 infoln(line);
1390 infoln(printCollationKey(key));
1391 return false;
1392 }
1393
1394 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1395 if(numLevels < UCOL_IDENTICAL) {
1396 ++numLevels;
1397 } else {
1398 numLevels = 5;
1399 }
1400 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1401 ++numLevels;
1402 }
1403 errorCode.assertSuccess();
1404 int32_t numLevelSeparators = 0;
1405 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1406 uint8_t b = keyBytes[i];
1407 if(b == 0) {
1408 infoln(fileTestName);
1409 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1410 infoln(line);
1411 infoln(printCollationKey(key));
1412 return false;
1413 }
1414 if(b == 1) { ++numLevelSeparators; }
1415 }
1416 if(numLevelSeparators != (numLevels - 1)) {
1417 infoln(fileTestName);
1418 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1419 norm, (int)numLevelSeparators, (int)numLevels);
1420 infoln(line);
1421 infoln(printCollationKey(key));
1422 return false;
1423 }
1424
1425 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1426 static const int32_t partSizes[] = { 32, 3, 1 };
1427 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1428 int32_t partSize = partSizes[psi];
1429 CharString parts;
1430 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1431 infoln(fileTestName);
1432 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1433 norm, (int)partSize, errorCode.errorName());
1434 infoln(line);
1435 return false;
1436 }
1437 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1438 infoln(fileTestName);
1439 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1440 norm, (int)partSize);
1441 infoln(line);
1442 infoln(printCollationKey(key));
1443 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1444 return false;
1445 }
1446 }
1447 return true;
1448 }
1449
1450 /**
1451 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1452 * Leaves key unchanged if s does not contain U+FFFE.
1453 * @return true if the key was successfully changed
1454 */
getMergedCollationKey(const char16_t * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1455 UBool CollationTest::getMergedCollationKey(const char16_t *s, int32_t length,
1456 CollationKey &key, IcuTestErrorCode &errorCode) {
1457 if(errorCode.isFailure()) { return false; }
1458 LocalMemory<uint8_t> mergedKey;
1459 int32_t mergedKeyLength = 0;
1460 int32_t mergedKeyCapacity = 0;
1461 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1462 int32_t segmentStart = 0;
1463 for(int32_t i = 0;;) {
1464 if(i == sLength) {
1465 if(segmentStart == 0) {
1466 // s does not contain any U+FFFE.
1467 return false;
1468 }
1469 } else if(s[i] != 0xfffe) {
1470 ++i;
1471 continue;
1472 }
1473 // Get the sort key for another segment and merge it into mergedKey.
1474 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1475 CollationKey key2;
1476 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1477 int32_t key1Length, key2Length;
1478 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1479 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1480 uint8_t *dest;
1481 int32_t minCapacity = key1Length + key2Length;
1482 if(key1Length > 0) { --minCapacity; }
1483 if(minCapacity <= mergedKeyCapacity) {
1484 dest = mergedKey.getAlias();
1485 } else {
1486 if(minCapacity <= 200) {
1487 mergedKeyCapacity = 200;
1488 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1489 mergedKeyCapacity *= 2;
1490 } else {
1491 mergedKeyCapacity = minCapacity;
1492 }
1493 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1494 }
1495 U_ASSERT(dest != nullptr || mergedKeyCapacity == 0);
1496 if(key1Length == 0) {
1497 // key2 is the sort key for the first segment.
1498 uprv_memcpy(dest, key2Bytes, key2Length);
1499 mergedKeyLength = key2Length;
1500 } else {
1501 mergedKeyLength =
1502 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1503 dest, mergedKeyCapacity);
1504 }
1505 if(i == sLength) { break; }
1506 segmentStart = ++i;
1507 }
1508 key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1509 return true;
1510 }
1511
1512 namespace {
1513
1514 /**
1515 * Replaces unpaired surrogates with U+FFFD.
1516 * Returns s if no replacement was made, otherwise buffer.
1517 */
/**
 * Replaces unpaired surrogates with U+FFFD.
 * The text is copied into buffer only once a replacement is needed.
 * NOTE(review): this relies on buffer being empty on entry - confirm with callers.
 *
 * @param s the input string
 * @param buffer receives the modified copy if any replacement was made
 * @return s if no replacement was made, otherwise buffer
 */
const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
    int32_t pos = 0;
    while(pos < s.length()) {
        const UChar32 cp = s.char32At(pos);
        if(U_IS_SURROGATE(cp)) {
            // Catch up with the text that has not been copied yet, then substitute.
            const int32_t copied = buffer.length();
            if(copied < pos) {
                buffer.append(s, copied, pos - copied);
            }
            buffer.append((char16_t)0xfffd);
        }
        pos += U16_LENGTH(cp);
    }
    if(buffer.isEmpty()) {
        return s;
    }
    // Copy the remainder after the last replacement.
    const int32_t copied = buffer.length();
    if(copied < pos) {
        buffer.append(s, copied, pos - copied);
    }
    return buffer;
}
1538
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1539 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1540 UCollationResult order, UBool collHasCaseLevel) {
1541 if(order == UCOL_EQUAL) {
1542 return Collation::NO_LEVEL;
1543 }
1544 int32_t prevKeyLength;
1545 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1546 int32_t keyLength;
1547 const uint8_t *bytes = key.getByteArray(keyLength);
1548 int32_t level = Collation::PRIMARY_LEVEL;
1549 for(int32_t i = 0;; ++i) {
1550 uint8_t b = prevBytes[i];
1551 if(b != bytes[i]) { break; }
1552 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1553 ++level;
1554 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1555 ++level;
1556 }
1557 }
1558 }
1559 return level;
1560 }
1561
1562 }
1563
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1564 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1565 const UnicodeString &prevString, const UnicodeString &s,
1566 UCollationResult expectedOrder, Collation::Level expectedLevel,
1567 IcuTestErrorCode &errorCode) {
1568 if(errorCode.isFailure()) { return false; }
1569
1570 // Get the sort keys first, for error debug output.
1571 CollationKey prevKey;
1572 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1573 prevKey, errorCode)) {
1574 return false;
1575 }
1576 CollationKey key;
1577 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return false; }
1578
1579 UCollationResult order = coll->compare(prevString, s, errorCode);
1580 if(order != expectedOrder || errorCode.isFailure()) {
1581 infoln(fileTestName);
1582 errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1583 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1584 infoln(prevFileLine);
1585 infoln(fileLine);
1586 infoln(printCollationKey(prevKey));
1587 infoln(printCollationKey(key));
1588 return false;
1589 }
1590 order = coll->compare(s, prevString, errorCode);
1591 if(order != -expectedOrder || errorCode.isFailure()) {
1592 infoln(fileTestName);
1593 errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1594 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1595 infoln(prevFileLine);
1596 infoln(fileLine);
1597 infoln(printCollationKey(prevKey));
1598 infoln(printCollationKey(key));
1599 return false;
1600 }
1601 // Test NUL-termination if the strings do not contain NUL characters.
1602 UBool containNUL = prevString.indexOf((char16_t)0) >= 0 || s.indexOf((char16_t)0) >= 0;
1603 if(!containNUL) {
1604 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1605 if(order != expectedOrder || errorCode.isFailure()) {
1606 infoln(fileTestName);
1607 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1608 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1609 infoln(prevFileLine);
1610 infoln(fileLine);
1611 infoln(printCollationKey(prevKey));
1612 infoln(printCollationKey(key));
1613 return false;
1614 }
1615 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1616 if(order != -expectedOrder || errorCode.isFailure()) {
1617 infoln(fileTestName);
1618 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1619 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1620 infoln(prevFileLine);
1621 infoln(fileLine);
1622 infoln(printCollationKey(prevKey));
1623 infoln(printCollationKey(key));
1624 return false;
1625 }
1626 }
1627
1628 // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1629 // Unpaired surrogates cannot be converted to UTF-8.
1630 // Create valid UTF-16 strings if necessary, and use those for
1631 // both the expected compare() result and for the input to compare(UTF-8).
1632 UnicodeString prevBuffer, sBuffer;
1633 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1634 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1635 std::string prevUTF8, sUTF8;
1636 UnicodeString(prevValid).toUTF8String(prevUTF8);
1637 UnicodeString(sValid).toUTF8String(sUTF8);
1638 UCollationResult expectedUTF8Order;
1639 if(&prevValid == &prevString && &sValid == &s) {
1640 expectedUTF8Order = expectedOrder;
1641 } else {
1642 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1643 }
1644
1645 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1646 if(order != expectedUTF8Order || errorCode.isFailure()) {
1647 infoln(fileTestName);
1648 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1649 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1650 infoln(prevFileLine);
1651 infoln(fileLine);
1652 infoln(printCollationKey(prevKey));
1653 infoln(printCollationKey(key));
1654 return false;
1655 }
1656 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1657 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1658 infoln(fileTestName);
1659 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1660 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1661 infoln(prevFileLine);
1662 infoln(fileLine);
1663 infoln(printCollationKey(prevKey));
1664 infoln(printCollationKey(key));
1665 return false;
1666 }
1667 // Test NUL-termination if the strings do not contain NUL characters.
1668 if(!containNUL) {
1669 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1670 if(order != expectedUTF8Order || errorCode.isFailure()) {
1671 infoln(fileTestName);
1672 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1673 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1674 infoln(prevFileLine);
1675 infoln(fileLine);
1676 infoln(printCollationKey(prevKey));
1677 infoln(printCollationKey(key));
1678 return false;
1679 }
1680 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1681 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1682 infoln(fileTestName);
1683 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1684 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1685 infoln(prevFileLine);
1686 infoln(fileLine);
1687 infoln(printCollationKey(prevKey));
1688 infoln(printCollationKey(key));
1689 return false;
1690 }
1691 }
1692
1693 UCharIterator leftIter;
1694 UCharIterator rightIter;
1695 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1696 uiter_setString(&rightIter, s.getBuffer(), s.length());
1697 order = coll->compare(leftIter, rightIter, errorCode);
1698 if(order != expectedOrder || errorCode.isFailure()) {
1699 infoln(fileTestName);
1700 errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1701 "wrong order: %d != %d (%s)",
1702 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1703 infoln(prevFileLine);
1704 infoln(fileLine);
1705 infoln(printCollationKey(prevKey));
1706 infoln(printCollationKey(key));
1707 return false;
1708 }
1709
1710 order = prevKey.compareTo(key, errorCode);
1711 if(order != expectedOrder || errorCode.isFailure()) {
1712 infoln(fileTestName);
1713 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1714 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1715 infoln(prevFileLine);
1716 infoln(fileLine);
1717 infoln(printCollationKey(prevKey));
1718 infoln(printCollationKey(key));
1719 return false;
1720 }
1721 UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1722 int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1723 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1724 if(level != expectedLevel) {
1725 infoln(fileTestName);
1726 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1727 (int)fileLineNumber, norm, order, level, expectedLevel);
1728 infoln(prevFileLine);
1729 infoln(fileLine);
1730 infoln(printCollationKey(prevKey));
1731 infoln(printCollationKey(key));
1732 return false;
1733 }
1734 }
1735
1736 // If either string contains U+FFFE, then their sort keys must compare the same as
1737 // the merged sort keys of each string's between-FFFE segments.
1738 //
1739 // It is not required that
1740 // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1741 // only that those two methods yield the same order.
1742 //
1743 // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1744 if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1745 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1746 errorCode.isFailure()) {
1747 order = prevKey.compareTo(key, errorCode);
1748 if(order != expectedOrder || errorCode.isFailure()) {
1749 infoln(fileTestName);
1750 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1751 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1752 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1753 infoln(prevFileLine);
1754 infoln(fileLine);
1755 infoln(printCollationKey(prevKey));
1756 infoln(printCollationKey(key));
1757 return false;
1758 }
1759 int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1760 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1761 if(mergedLevel != level) {
1762 infoln(fileTestName);
1763 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1764 "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1765 (int)fileLineNumber, norm, order, mergedLevel, level);
1766 infoln(prevFileLine);
1767 infoln(fileLine);
1768 infoln(printCollationKey(prevKey));
1769 infoln(printCollationKey(key));
1770 return false;
1771 }
1772 }
1773 }
1774 return true;
1775 }
1776
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1777 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1778 if(errorCode.isFailure()) { return; }
1779 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1780 UnicodeString prevString, s;
1781 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1782 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1783 // Parse the line even if it will be ignored (when we do not have a Collator)
1784 // in order to report syntax issues.
1785 Collation::Level relation = parseRelationAndString(s, errorCode);
1786 if(errorCode.isFailure()) {
1787 errorCode.reset();
1788 break;
1789 }
1790 if(coll == nullptr) {
1791 // We were unable to create the Collator but continue with tests.
1792 // Ignore test data for this Collator.
1793 // The next Collator creation might work.
1794 continue;
1795 }
1796 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1797 Collation::Level expectedLevel = relation;
1798 s.getTerminatedBuffer(); // Ensure NUL-termination.
1799 UBool isOk = true;
1800 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1801 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1802 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1803 expectedOrder, expectedLevel, errorCode);
1804 }
1805 if(isOk) {
1806 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1807 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1808 expectedOrder, expectedLevel, errorCode);
1809 }
1810 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1811 UnicodeString pn = nfd->normalize(prevString, errorCode);
1812 UnicodeString n = nfd->normalize(s, errorCode);
1813 pn.getTerminatedBuffer();
1814 n.getTerminatedBuffer();
1815 errorCode.assertSuccess();
1816 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1817 expectedOrder, expectedLevel, errorCode);
1818 }
1819 if(!isOk) {
1820 errorCode.reset(); // already reported
1821 }
1822 prevFileLine = fileLine;
1823 prevString = s;
1824 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1825 }
1826 }
1827
TestDataDriven()1828 void CollationTest::TestDataDriven() {
1829 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1830
1831 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1832 nfd = Normalizer2::getNFDInstance(errorCode);
1833 if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1834 return;
1835 }
1836
1837 CharString path(getSourceTestData(errorCode), errorCode);
1838 path.appendPathPart("collationtest.txt", errorCode);
1839 const char *codePage = "UTF-8";
1840 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, true, false, errorCode));
1841 if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1842 return;
1843 }
1844 // Read a new line if necessary.
1845 // Sub-parsers leave the first line set that they do not handle.
1846 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1847 if(!isSectionStarter(fileLine[0])) {
1848 errln("syntax error on line %d", (int)fileLineNumber);
1849 infoln(fileLine);
1850 return;
1851 }
1852 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1853 fileTestName = fileLine;
1854 logln(fileLine);
1855 fileLine.remove();
1856 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1857 setRootCollator(errorCode);
1858 fileLine.remove();
1859 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1860 setLocaleCollator(errorCode);
1861 fileLine.remove();
1862 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1863 buildTailoring(f.getAlias(), errorCode);
1864 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1865 parseAndSetAttribute(errorCode);
1866 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1867 checkCompareStrings(f.getAlias(), errorCode);
1868 } else {
1869 errln("syntax error on line %d", (int)fileLineNumber);
1870 infoln(fileLine);
1871 return;
1872 }
1873 }
1874 }
1875
TestLongLocale()1876 void CollationTest::TestLongLocale() {
1877 IcuTestErrorCode errorCode(*this, "TestLongLocale");
1878 Locale longLocale("sie__1G_C_CEIE_CEZCX_CSUE_E_EIESZNI2_GB_LM_LMCSUE_LMCSX_"
1879 "LVARIANT_MMCSIE_STEU_SU1GCEIE_SU6G_SU6SU6G_U_UBGE_UC_"
1880 "UCEZCSI_UCIE_UZSIU_VARIANT_X@collation=bcs-ukvsz");
1881 LocalPointer<Collator> coll(Collator::createInstance(longLocale, errorCode));
1882 }
1883
TestHang22414()1884 void CollationTest::TestHang22414() {
1885 IcuTestErrorCode errorCode(*this, "TestHang22414");
1886 const char* cases[] = {
1887 "en", // just make sure the code work.
1888 // The following hang before fixing ICU-22414
1889 "sr-Latn-TH-t-su-BM-u-co-private-unihan-x-lvariant-zxsuhc-vss-vjf-0-kn-"
1890 "uaktmtca-uce66u-vtcb1ik-ubsuuuk8-u3iucls-ue38925l-vau30i-u6uccttg-"
1891 "u1iuylik-u-ueein-zzzz",
1892 };
1893 for(int32_t i = 0; i < UPRV_LENGTHOF(cases); i ++) {
1894 icu::Locale l = icu::Locale::forLanguageTag(cases[i], errorCode);
1895 // Make sure the following won't hang.
1896 LocalPointer<Collator> coll(Collator::createInstance(l, errorCode));
1897 errorCode.reset();
1898 }
1899 }
TestBuilderContextsOverflow()1900 void CollationTest::TestBuilderContextsOverflow() {
1901 IcuTestErrorCode errorCode(*this, "TestBuilderContextsOverflow");
1902 // ICU-20715: Bad memory access in what looks like a bogus CharsTrie after
1903 // intermediate contextual-mappings data overflowed.
1904 // Caused by the CollationDataBuilder using some outdated values when building
1905 // contextual mappings with both prefix and contraction matching.
1906 // Fixed by resetting those outdated values before code looks at them.
1907 char16_t rules[] = {
1908 u'&', 0x10, 0x2ff, 0x503c, 0x4617,
1909 u'=', 0x80, 0x4f7f, 0xff, 0x3c3d, 0x1c4f, 0x3c3c,
1910 u'<', 0, 0, 0, 0, u'|', 0, 0, 0, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f, 0xff,
1911 u'=', 0, u'|', 0, 0, 0, 0, 0, 0, 0x1f00, 0xe30,
1912 0x3035, 0, 0, 0xd200, 0, 0x7f00, 0xff4f, 0x3d00, 0, 0x7c00,
1913 0, 0, 0, 0, 0, 0, 0, 0x301f, 0x350e, 0x30,
1914 0, 0, 0xd2, 0x7c00, 0, 0, 0, 0, 0, 0,
1915 0, 0x301f, 0x350e, 0x30, 0, 0, 0x52d2, 0x2f3c, 0x5552, 0x493c,
1916 0x1f10, 0x1f50, 0x300, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f,
1917 0xff,
1918 u'=', 0, u'|', 0, 0, 0, 0, 0x5000, 0x4617,
1919 u'=', 0x80, 0x4f7f, 0, 0, 0xd200, 0
1920 };
1921 UnicodeString s(false, rules, UPRV_LENGTHOF(rules));
1922 LocalPointer<Collator> coll(new RuleBasedCollator(s, errorCode), errorCode);
1923 if(errorCode.isSuccess()) {
1924 logln("successfully built the Collator");
1925 }
1926 }
1927
1928 #endif // !UCONFIG_NO_COLLATION
1929