1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/stringpiece.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uniset.h"
29 #include "unicode/unistr.h"
30 #include "unicode/usetiter.h"
31 #include "unicode/ustring.h"
32 #include "charstr.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationfcd.h"
37 #include "collationiterator.h"
38 #include "collationroot.h"
39 #include "collationrootelements.h"
40 #include "collationruleparser.h"
41 #include "collationweights.h"
42 #include "cstring.h"
43 #include "intltest.h"
44 #include "normalizer2impl.h"
45 #include "ucbuf.h"
46 #include "uhash.h"
47 #include "uitercollationiterator.h"
48 #include "utf16collationiterator.h"
49 #include "utf8collationiterator.h"
50 #include "uvectr32.h"
51 #include "uvectr64.h"
52 #include "writesrc.h"
53
54 class CodePointIterator;
55
56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57
58 class CollationTest : public IntlTest {
59 public:
CollationTest()60 CollationTest()
61 : fcd(NULL), nfd(NULL),
62 fileLineNumber(0),
63 coll(NULL) {}
64
~CollationTest()65 ~CollationTest() {
66 delete coll;
67 }
68
69 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL) override;
70
71 void TestMinMax();
72 void TestImplicits();
73 void TestNulTerminated();
74 void TestIllegalUTF8();
75 void TestShortFCDData();
76 void TestFCD();
77 void TestCollationWeights();
78 void TestRootElements();
79 void TestTailoredElements();
80 void TestDataDriven();
81 void TestLongLocale();
82 void TestBuilderContextsOverflow();
83
84 private:
85 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
86 void checkAllocWeights(CollationWeights &cw,
87 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
88 int32_t someLength, int32_t minCount);
89
90 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
91 static UnicodeString printCollationKey(const CollationKey &key);
92
93 // Helpers & fields for data-driven test.
isCROrLF(UChar c)94 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)95 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)96 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
skipSpaces(int32_t i)97 int32_t skipSpaces(int32_t i) {
98 while(isSpace(fileLine[i])) { ++i; }
99 return i;
100 }
101
102 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
103 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
104 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
105 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
106 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
107 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
108 void setRootCollator(IcuTestErrorCode &errorCode);
109 void setLocaleCollator(IcuTestErrorCode &errorCode);
110
111 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
112
113 UBool getSortKeyParts(const UChar *s, int32_t length,
114 CharString &dest, int32_t partSize,
115 IcuTestErrorCode &errorCode);
116 UBool getCollationKey(const char *norm, const UnicodeString &line,
117 const UChar *s, int32_t length,
118 CollationKey &key, IcuTestErrorCode &errorCode);
119 UBool getMergedCollationKey(const UChar *s, int32_t length,
120 CollationKey &key, IcuTestErrorCode &errorCode);
121 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
122 const UnicodeString &prevString, const UnicodeString &s,
123 UCollationResult expectedOrder, Collation::Level expectedLevel,
124 IcuTestErrorCode &errorCode);
125 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
126
127 const Normalizer2 *fcd, *nfd;
128 UnicodeString fileLine;
129 int32_t fileLineNumber;
130 UnicodeString fileTestName;
131 Collator *coll;
132 };
133
createCollationTest()134 extern IntlTest *createCollationTest() {
135 return new CollationTest();
136 }
137
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)138 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
139 if(exec) {
140 logln("TestSuite CollationTest: ");
141 }
142 TESTCASE_AUTO_BEGIN;
143 TESTCASE_AUTO(TestMinMax);
144 TESTCASE_AUTO(TestImplicits);
145 TESTCASE_AUTO(TestNulTerminated);
146 TESTCASE_AUTO(TestIllegalUTF8);
147 TESTCASE_AUTO(TestShortFCDData);
148 TESTCASE_AUTO(TestFCD);
149 TESTCASE_AUTO(TestCollationWeights);
150 TESTCASE_AUTO(TestRootElements);
151 TESTCASE_AUTO(TestTailoredElements);
152 TESTCASE_AUTO(TestDataDriven);
153 TESTCASE_AUTO(TestLongLocale);
154 TESTCASE_AUTO(TestBuilderContextsOverflow);
155 TESTCASE_AUTO_END;
156 }
157
TestMinMax()158 void CollationTest::TestMinMax() {
159 IcuTestErrorCode errorCode(*this, "TestMinMax");
160
161 setRootCollator(errorCode);
162 if(errorCode.isFailure()) {
163 errorCode.reset();
164 return;
165 }
166 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
167 if(rbc == NULL) {
168 errln("the root collator is not a RuleBasedCollator");
169 return;
170 }
171
172 static const UChar s[2] = { 0xfffe, 0xffff };
173 UVector64 ces(errorCode);
174 rbc->internalGetCEs(UnicodeString(false, s, 2), ces, errorCode);
175 errorCode.assertSuccess();
176 if(ces.size() != 2) {
177 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
178 return;
179 }
180 int64_t ce = ces.elementAti(0);
181 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
182 if(ce != expected) {
183 errln("CE(U+fffe)=%04lx != 02..", (long)ce);
184 }
185
186 ce = ces.elementAti(1);
187 expected = Collation::makeCE(Collation::MAX_PRIMARY);
188 if(ce != expected) {
189 errln("CE(U+ffff)=%04lx != max..", (long)ce);
190 }
191 }
192
TestImplicits()193 void CollationTest::TestImplicits() {
194 IcuTestErrorCode errorCode(*this, "TestImplicits");
195
196 const CollationData *cd = CollationRoot::getData(errorCode);
197 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
198 return;
199 }
200
201 // Implicit primary weights should be assigned for the following sets,
202 // and sort in ascending order by set and then code point.
203 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
204
205 // core Han Unified Ideographs
206 UnicodeSet coreHan("[\\p{unified_ideograph}&"
207 "[\\p{Block=CJK_Unified_Ideographs}"
208 "\\p{Block=CJK_Compatibility_Ideographs}]]",
209 errorCode);
210 // all other Unified Han ideographs
211 UnicodeSet otherHan("[\\p{unified ideograph}-"
212 "[\\p{Block=CJK_Unified_Ideographs}"
213 "\\p{Block=CJK_Compatibility_Ideographs}]]",
214 errorCode);
215 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
216 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
217
218 // Starting with CLDR 26/ICU 54, the root Han order may instead be
219 // the Unihan radical-stroke order.
220 // The tests should pass either way, so we only test the order of a small set of Han characters
221 // whose radical-stroke order is the same as their code point order.
222 UnicodeSet someHanInCPOrder(
223 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
224 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
225 errorCode);
226 UnicodeSet inOrder(someHanInCPOrder);
227 inOrder.addAll(unassigned).freeze();
228 if(errorCode.errIfFailureAndReset("UnicodeSet")) {
229 return;
230 }
231 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
232 UChar32 prev = 0;
233 uint32_t prevPrimary = 0;
234 UTF16CollationIterator ci(cd, false, NULL, NULL, NULL);
235 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
236 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
237 while(iter->next()) {
238 UChar32 c = iter->getCodepoint();
239 UnicodeString s(c);
240 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
241 int64_t ce = ci.nextCE(errorCode);
242 int64_t ce2 = ci.nextCE(errorCode);
243 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
244 return;
245 }
246 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
247 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
248 continue;
249 }
250 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
251 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
252 (long)c, (long)(ce & 0xffffffff));
253 continue;
254 }
255 uint32_t primary = (uint32_t)(ce >> 32);
256 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
257 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
258 (long)c, (long)primary, (long)prev, (long)prevPrimary);
259 }
260 prev = c;
261 prevPrimary = primary;
262 }
263 }
264 }
265
TestNulTerminated()266 void CollationTest::TestNulTerminated() {
267 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
268 const CollationData *data = CollationRoot::getData(errorCode);
269 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
270 return;
271 }
272
273 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
274
275 UTF16CollationIterator ci1(data, false, s, s, s + 2);
276 UTF16CollationIterator ci2(data, false, s + 2, s + 2, NULL);
277 for(int32_t i = 0;; ++i) {
278 int64_t ce1 = ci1.nextCE(errorCode);
279 int64_t ce2 = ci2.nextCE(errorCode);
280 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
281 return;
282 }
283 if(ce1 != ce2) {
284 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
285 break;
286 }
287 if(ce1 == Collation::NO_CE) { break; }
288 }
289 }
290
TestIllegalUTF8()291 void CollationTest::TestIllegalUTF8() {
292 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
293
294 setRootCollator(errorCode);
295 if(errorCode.isFailure()) {
296 errorCode.reset();
297 return;
298 }
299 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
300
301 static const StringPiece strings[] = {
302 // string with U+FFFD == illegal byte sequence
303 u8"a\uFFFDz", "a\x80z", // trail byte
304 u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
305 u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
306 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
307 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
308 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
309 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
310 };
311
312 for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
313 StringPiece fffd(strings[i]);
314 StringPiece illegal(strings[i + 1]);
315 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
316 if(order != UCOL_EQUAL) {
317 errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
318 (int)i, order);
319 }
320 }
321 }
322
323 namespace {
324
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)325 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
326 for(UChar32 c = 0x10000; c < 0x110000;) {
327 UChar32 next = c + 0x400;
328 if(src.containsSome(c, next - 1)) {
329 dest.add(U16_LEAD(c));
330 }
331 c = next;
332 }
333 }
334
335 } // namespace
336
TestShortFCDData()337 void CollationTest::TestShortFCDData() {
338 // See CollationFCD class comments.
339 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
340 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
341 errorCode.assertSuccess();
342 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
343 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
344 UnicodeSet lccc; // actual
345 for(UChar32 c = 0; c <= 0xffff; ++c) {
346 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
347 }
348 UnicodeSet diff(expectedLccc);
349 diff.removeAll(lccc);
350 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
351 UnicodeString empty("[]");
352 UnicodeString diffString;
353 diff.toPattern(diffString, true);
354 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
355 diff = lccc;
356 diff.removeAll(expectedLccc);
357 diff.toPattern(diffString, true);
358 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, true);
359
360 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
361 if (errorCode.isSuccess()) {
362 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
363 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
364 UnicodeSet tccc; // actual
365 for(UChar32 c = 0; c <= 0xffff; ++c) {
366 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
367 }
368 diff = expectedTccc;
369 diff.removeAll(tccc);
370 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
371 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
372 diff = tccc;
373 diff.removeAll(expectedTccc);
374 diff.toPattern(diffString, true);
375 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
376 }
377 }
378
379 class CodePointIterator {
380 public:
CodePointIterator(const UChar32 * cp,int32_t length)381 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()382 void resetToStart() { pos = 0; }
next()383 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()384 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const385 int32_t getLength() const { return length; }
getIndex() const386 int getIndex() const { return (int)pos; }
387 private:
388 const UChar32 *cp;
389 int32_t length;
390 int32_t pos;
391 };
392
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)393 void CollationTest::checkFCD(const char *name,
394 CollationIterator &ci, CodePointIterator &cpi) {
395 IcuTestErrorCode errorCode(*this, "checkFCD");
396
397 // Iterate forward to the limit.
398 for(;;) {
399 UChar32 c1 = ci.nextCodePoint(errorCode);
400 UChar32 c2 = cpi.next();
401 if(c1 != c2) {
402 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
403 name, (long)c1, (long)c2, cpi.getIndex());
404 return;
405 }
406 if(c1 < 0) { break; }
407 }
408
409 // Iterate backward most of the way.
410 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
411 UChar32 c1 = ci.previousCodePoint(errorCode);
412 UChar32 c2 = cpi.previous();
413 if(c1 != c2) {
414 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
415 name, (long)c1, (long)c2, cpi.getIndex());
416 return;
417 }
418 }
419
420 // Forward again.
421 for(;;) {
422 UChar32 c1 = ci.nextCodePoint(errorCode);
423 UChar32 c2 = cpi.next();
424 if(c1 != c2) {
425 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
426 name, (long)c1, (long)c2, cpi.getIndex());
427 return;
428 }
429 if(c1 < 0) { break; }
430 }
431
432 // Iterate backward to the start.
433 for(;;) {
434 UChar32 c1 = ci.previousCodePoint(errorCode);
435 UChar32 c2 = cpi.previous();
436 if(c1 != c2) {
437 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
438 name, (long)c1, (long)c2, cpi.getIndex());
439 return;
440 }
441 if(c1 < 0) { break; }
442 }
443 }
444
TestFCD()445 void CollationTest::TestFCD() {
446 IcuTestErrorCode errorCode(*this, "TestFCD");
447 const CollationData *data = CollationRoot::getData(errorCode);
448 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
449 return;
450 }
451
452 // Input string, not FCD, NUL-terminated.
453 static const UChar s[] = {
454 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
455 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
456 0x327, 0x308, // ccc=202, 230
457 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
458 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
459 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
460 0xac01,
461 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
462 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
463 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
464 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
465 0x4e00, 0xf81,
466 0
467 };
468 // Expected code points.
469 static const UChar32 cp[] = {
470 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
471 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
472 0x1D15F, 0x1D16D,
473 0xac01,
474 0x63, 0x327, 0x1D165, 0x1D16D,
475 0x61,
476 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
477 0x4e00, 0xf71, 0xf80
478 };
479
480 FCDUTF16CollationIterator u16ci(data, false, s, s, NULL);
481 if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
482 return;
483 }
484 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
485 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
486
487 cpi.resetToStart();
488 std::string utf8;
489 UnicodeString(s).toUTF8String(utf8);
490 FCDUTF8CollationIterator u8ci(data, false,
491 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
492 if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
493 return;
494 }
495 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
496
497 cpi.resetToStart();
498 UCharIterator iter;
499 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the terminating NUL
500 FCDUIterCollationIterator uici(data, false, iter, 0);
501 if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
502 return;
503 }
504 checkFCD("FCDUIterCollationIterator", uici, cpi);
505 }
506
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)507 void CollationTest::checkAllocWeights(CollationWeights &cw,
508 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
509 int32_t someLength, int32_t minCount) {
510 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
511 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = false",
512 (long)lowerLimit, (long)upperLimit, (long)n);
513 return;
514 }
515 uint32_t previous = lowerLimit;
516 int32_t count = 0; // number of weights that have someLength
517 for(int32_t i = 0; i < n; ++i) {
518 uint32_t w = cw.nextWeight();
519 if(w == 0xffffffff) {
520 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
521 "returns only %ld weights",
522 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
523 return;
524 }
525 if(!(previous < w && w < upperLimit)) {
526 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
527 "number %ld -> %lx not between %lx and %lx",
528 (long)lowerLimit, (long)upperLimit, (long)n,
529 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
530 return;
531 }
532 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
533 }
534 if(count < minCount) {
535 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
536 "returns only %ld < %ld weights of length %d",
537 (long)lowerLimit, (long)upperLimit, (long)n,
538 (long)count, (long)minCount, (int)someLength);
539 }
540 }
541
TestCollationWeights()542 void CollationTest::TestCollationWeights() {
543 CollationWeights cw;
544
545 // Non-compressible primaries use 254 second bytes 02..FF.
546 logln("CollationWeights.initForPrimary(non-compressible)");
547 cw.initForPrimary(false);
548 // Expect 1 weight 11 and 254 weights 12xx.
549 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
550 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
551 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
552 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
553 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
554 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
555 // Expect 254^2=64516 three-byte weights.
556 // During computation, there should be 3 three-byte ranges
557 // 10ffff, 11xxxx, 120202.
558 // The middle one should be split 64515:1,
559 // and the newly-split-off range and the last ranged lengthened.
560 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
561 // Expect weights 1102 & 1103.
562 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
563 // Expect weights 102102 & 102103.
564 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
565
566 // Compressible primaries use 251 second bytes 04..FE.
567 logln("CollationWeights.initForPrimary(compressible)");
568 cw.initForPrimary(true);
569 // Expect 1 weight 11 and 251 weights 12xx.
570 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
571 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
572 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
573 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
574 // Expect weights 1104 & 1105.
575 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
576 // Expect weights 102102 & 102103.
577 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
578
579 // Secondary and tertiary weights use only bytes 3 & 4.
580 logln("CollationWeights.initForSecondary()");
581 cw.initForSecondary();
582 // Expect weights fbxx and all four fc..ff.
583 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
584
585 logln("CollationWeights.initForTertiary()");
586 cw.initForTertiary();
587 // Expect weights 3dxx and both 3e & 3f.
588 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
589 }
590
591 namespace {
592
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)593 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
594 uint32_t p, uint32_t s, uint32_t ctq) {
595 uint32_t p1 = p >> 24;
596 uint32_t p2 = (p >> 16) & 0xff;
597 uint32_t p3 = (p >> 8) & 0xff;
598 uint32_t p4 = p & 0xff;
599 uint32_t s1 = s >> 8;
600 uint32_t s2 = s & 0xff;
601 // ctq = Case, Tertiary, Quaternary
602 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
603 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
604 uint32_t t1 = t >> 8;
605 uint32_t t2 = t & 0xff;
606 uint32_t q = ctq & Collation::QUATERNARY_MASK;
607 // No leading zero bytes.
608 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
609 return false;
610 }
611 // No intermediate zero bytes.
612 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
613 return false;
614 }
615 if(p2 != 0 && p3 == 0 && p4 != 0) {
616 return false;
617 }
618 // Minimum & maximum lead bytes.
619 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
620 s1 == Collation::LEVEL_SEPARATOR_BYTE ||
621 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
622 return false;
623 }
624 if(c > 2) {
625 return false;
626 }
627 // The valid byte range for the second primary byte depends on compressibility.
628 if(p2 != 0) {
629 if(data.isCompressibleLeadByte(p1)) {
630 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
631 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
632 return false;
633 }
634 } else {
635 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
636 return false;
637 }
638 }
639 }
640 // Other bytes just need to avoid the level separator.
641 // Trailing zeros are ok.
642 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
643 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
644 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
645 return false;
646 }
647 // Well-formed CEs.
648 if(p == 0) {
649 if(s == 0) {
650 if(t == 0) {
651 // Completely ignorable CE.
652 // Quaternary CEs are not supported.
653 if(c != 0 || q != 0) {
654 return false;
655 }
656 } else {
657 // Tertiary CE.
658 if(t < re.getTertiaryBoundary() || c != 2) {
659 return false;
660 }
661 }
662 } else {
663 // Secondary CE.
664 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
665 return false;
666 }
667 }
668 } else {
669 // Primary CE.
670 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
671 s >= re.getSecondaryBoundary()) {
672 return false;
673 }
674 if(t == 0 || t >= re.getTertiaryBoundary()) {
675 return false;
676 }
677 }
678 return true;
679 }
680
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)681 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
682 uint32_t p = (uint32_t)(ce >> 32);
683 uint32_t secTer = (uint32_t)ce;
684 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
685 }
686
687 class RootElementsIterator {
688 public:
RootElementsIterator(const CollationData & root)689 RootElementsIterator(const CollationData &root)
690 : data(root),
691 elements(root.rootElements), length(root.rootElementsLength),
692 pri(0), secTer(0),
693 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
694
next()695 UBool next() {
696 if(index >= length) { return false; }
697 uint32_t p = elements[index];
698 if(p == CollationRootElements::PRIMARY_SENTINEL) { return false; }
699 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
700 ++index;
701 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
702 return true;
703 }
704 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
705 // End of a range, enumerate the primaries in the range.
706 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
707 p &= 0xffffff00;
708 if(pri == p) {
709 // Finished the range, return the next CE after it.
710 ++index;
711 return next();
712 }
713 U_ASSERT(pri < p);
714 // Return the next primary in this range.
715 UBool isCompressible = data.isCompressiblePrimary(pri);
716 if((pri & 0xffff) == 0) {
717 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
718 } else {
719 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
720 }
721 return true;
722 }
723 // Simple primary CE.
724 ++index;
725 pri = p;
726 // Does this have an explicit below-common sec/ter unit,
727 // or does it imply a common one?
728 if(index == length) {
729 secTer = Collation::COMMON_SEC_AND_TER_CE;
730 } else {
731 secTer = elements[index];
732 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
733 // No sec/ter delta.
734 secTer = Collation::COMMON_SEC_AND_TER_CE;
735 } else {
736 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
737 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
738 // Implied sec/ter.
739 secTer = Collation::COMMON_SEC_AND_TER_CE;
740 } else {
741 // Explicit sec/ter below common/common.
742 ++index;
743 }
744 }
745 }
746 return true;
747 }
748
getPrimary() const749 uint32_t getPrimary() const { return pri; }
getSecTer() const750 uint32_t getSecTer() const { return secTer; }
751
752 private:
753 const CollationData &data;
754 const uint32_t *elements;
755 int32_t length;
756
757 uint32_t pri;
758 uint32_t secTer;
759 int32_t index;
760 };
761
762 } // namespace
763
TestRootElements()764 void CollationTest::TestRootElements() {
765 IcuTestErrorCode errorCode(*this, "TestRootElements");
766 const CollationData *root = CollationRoot::getData(errorCode);
767 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
768 return;
769 }
770 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
771 RootElementsIterator iter(*root);
772
773 // We check each root CE for validity,
774 // and we also verify that there is a tailoring gap between each two CEs.
775 CollationWeights cw1c; // compressible primary weights
776 CollationWeights cw1u; // uncompressible primary weights
777 CollationWeights cw2;
778 CollationWeights cw3;
779
780 cw1c.initForPrimary(true);
781 cw1u.initForPrimary(false);
782 cw2.initForSecondary();
783 cw3.initForTertiary();
784
785 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
786 // nor the special merge-separator CE for U+FFFE.
787 uint32_t prevPri = 0;
788 uint32_t prevSec = 0;
789 uint32_t prevTer = 0;
790 while(iter.next()) {
791 uint32_t pri = iter.getPrimary();
792 uint32_t secTer = iter.getSecTer();
793 // CollationRootElements CEs must have 0 case and quaternary bits.
794 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
795 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
796 (long)pri, (long)secTer);
797 }
798 uint32_t sec = secTer >> 16;
799 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
800 uint32_t ctq = ter;
801 if(pri == 0 && sec == 0 && ter != 0) {
802 // Tertiary CEs must have uppercase bits,
803 // but they are not stored in the CollationRootElements.
804 ctq |= 0x8000;
805 }
806 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
807 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
808 } else {
809 if(pri != prevPri) {
810 uint32_t newWeight = 0;
811 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
812 // There is currently no tailoring gap after primary ignorables,
813 // and we forbid tailoring after U+FFFD and U+FFFF.
814 } else if(root->isCompressiblePrimary(prevPri)) {
815 if(!cw1c.allocWeights(prevPri, pri, 1)) {
816 errln("no primary/compressible tailoring gap between %08lx and %08lx",
817 (long)prevPri, (long)pri);
818 } else {
819 newWeight = cw1c.nextWeight();
820 }
821 } else {
822 if(!cw1u.allocWeights(prevPri, pri, 1)) {
823 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
824 (long)prevPri, (long)pri);
825 } else {
826 newWeight = cw1u.nextWeight();
827 }
828 }
829 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
830 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
831 (long)prevPri, (long)newWeight, (long)pri);
832 }
833 } else if(sec != prevSec) {
834 uint32_t lowerLimit =
835 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
836 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
837 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
838 } else {
839 uint32_t newWeight = cw2.nextWeight();
840 if(!(prevSec < newWeight && newWeight < sec)) {
841 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
842 (long)lowerLimit, (long)newWeight, (long)sec);
843 }
844 }
845 } else if(ter != prevTer) {
846 uint32_t lowerLimit =
847 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
848 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
849 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
850 } else {
851 uint32_t newWeight = cw3.nextWeight();
852 if(!(prevTer < newWeight && newWeight < ter)) {
853 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
854 (long)lowerLimit, (long)newWeight, (long)ter);
855 }
856 }
857 } else {
858 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
859 }
860 }
861 prevPri = pri;
862 prevSec = sec;
863 prevTer = ter;
864 }
865 }
866
TestTailoredElements()867 void CollationTest::TestTailoredElements() {
868 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
869 const CollationData *root = CollationRoot::getData(errorCode);
870 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
871 return;
872 }
873 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
874
875 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
876 if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
877 return;
878 }
879 uhash_setKeyDeleter(prevLocales, uprv_free);
880 // TestRootElements() tests the root collator which does not have tailorings.
881 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
882 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
883 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
884
885 UVector64 ces(errorCode);
886 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
887 U_ASSERT(locales.isValid());
888 const char *localeID = "root";
889 do {
890 Locale locale(localeID);
891 LocalPointer<StringEnumeration> types(
892 Collator::getKeywordValuesForLocale("collation", locale, false, errorCode));
893 errorCode.assertSuccess();
894 const char *type; // first: default type
895 while((type = types->next(NULL, errorCode)) != NULL) {
896 if(strncmp(type, "private-", 8) == 0) {
897 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
898 localeID, type);
899 }
900 Locale localeWithType(locale);
901 localeWithType.setKeywordValue("collation", type, errorCode);
902 errorCode.assertSuccess();
903 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
904 if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
905 localeWithType.getName())) {
906 continue;
907 }
908 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
909 if(uhash_geti(prevLocales, actual.getName()) != 0) {
910 continue;
911 }
912 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
913 errorCode.assertSuccess();
914 logln("TestTailoredElements(): requested %s -> actual %s",
915 localeWithType.getName(), actual.getName());
916 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
917 if(rbc == NULL) {
918 continue;
919 }
920 // Note: It would be better to get tailored strings such that we can
921 // identify the prefix, and only get the CEs for the prefix+string,
922 // not also for the prefix.
923 // There is currently no API for that.
924 // It would help in an unusual case where a contraction starting in the prefix
925 // extends past its end, and we do not see the intended mapping.
926 // For example, for a mapping p|st, if there is also a contraction ps,
927 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
928 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
929 errorCode.assertSuccess();
930 UnicodeSetIterator iter(*tailored);
931 while(iter.next()) {
932 const UnicodeString &s = iter.getString();
933 ces.removeAllElements();
934 rbc->internalGetCEs(s, ces, errorCode);
935 errorCode.assertSuccess();
936 for(int32_t i = 0; i < ces.size(); ++i) {
937 int64_t ce = ces.elementAti(i);
938 if(!isValidCE(rootElements, *root, ce)) {
939 errln("invalid tailored CE %016llx at CE index %d from string:",
940 (long long)ce, (int)i);
941 infoln(prettify(s));
942 }
943 }
944 }
945 }
946 } while((localeID = locales->next(NULL, errorCode)) != NULL);
947 uhash_close(prevLocales);
948 }
949
printSortKey(const uint8_t * p,int32_t length)950 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
951 UnicodeString s;
952 for(int32_t i = 0; i < length; ++i) {
953 if(i > 0) { s.append((UChar)0x20); }
954 uint8_t b = p[i];
955 if(b == 0) {
956 s.append((UChar)0x2e); // period
957 } else if(b == 1) {
958 s.append((UChar)0x7c); // vertical bar
959 } else {
960 appendHex(b, 2, s);
961 }
962 }
963 return s;
964 }
965
printCollationKey(const CollationKey & key)966 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
967 int32_t length;
968 const uint8_t *p = key.getByteArray(length);
969 return printSortKey(p, length);
970 }
971
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)972 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
973 for(;;) {
974 int32_t lineLength;
975 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
976 if(line == NULL || errorCode.isFailure()) {
977 fileLine.remove();
978 return false;
979 }
980 ++fileLineNumber;
981 // Strip trailing CR/LF, comments, and spaces.
982 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
983 if(comment != NULL) {
984 lineLength = (int32_t)(comment - line);
985 } else {
986 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
987 }
988 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
989 if(lineLength != 0) {
990 fileLine.setTo(false, line, lineLength);
991 return true;
992 }
993 // Empty line, continue.
994 }
995 }
996
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)997 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
998 UErrorCode &errorCode) {
999 int32_t length = fileLine.length();
1000 int32_t i;
1001 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1002 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
1003 if(pipeIndex >= 0) {
1004 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1005 if(prefix.isEmpty()) {
1006 errln("empty prefix on line %d", (int)fileLineNumber);
1007 infoln(fileLine);
1008 errorCode = U_PARSE_ERROR;
1009 return;
1010 }
1011 start = pipeIndex + 1;
1012 } else {
1013 prefix.remove();
1014 }
1015 s = fileLine.tempSubStringBetween(start, i).unescape();
1016 if(s.isEmpty()) {
1017 errln("empty string on line %d", (int)fileLineNumber);
1018 infoln(fileLine);
1019 errorCode = U_PARSE_ERROR;
1020 return;
1021 }
1022 start = i;
1023 }
1024
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1025 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1026 Collation::Level relation;
1027 int32_t start;
1028 if(fileLine[0] == 0x3c) { // <
1029 UChar second = fileLine[1];
1030 start = 2;
1031 switch(second) {
1032 case 0x31: // <1
1033 relation = Collation::PRIMARY_LEVEL;
1034 break;
1035 case 0x32: // <2
1036 relation = Collation::SECONDARY_LEVEL;
1037 break;
1038 case 0x33: // <3
1039 relation = Collation::TERTIARY_LEVEL;
1040 break;
1041 case 0x34: // <4
1042 relation = Collation::QUATERNARY_LEVEL;
1043 break;
1044 case 0x63: // <c
1045 relation = Collation::CASE_LEVEL;
1046 break;
1047 case 0x69: // <i
1048 relation = Collation::IDENTICAL_LEVEL;
1049 break;
1050 default: // just <
1051 relation = Collation::NO_LEVEL;
1052 start = 1;
1053 break;
1054 }
1055 } else if(fileLine[0] == 0x3d) { // =
1056 relation = Collation::ZERO_LEVEL;
1057 start = 1;
1058 } else {
1059 start = 0;
1060 }
1061 if(start == 0 || !isSpace(fileLine[start])) {
1062 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1063 infoln(fileLine);
1064 errorCode.set(U_PARSE_ERROR);
1065 return Collation::NO_LEVEL;
1066 }
1067 start = skipSpaces(start);
1068 UnicodeString prefix;
1069 parseString(start, prefix, s, errorCode);
1070 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1071 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1072 infoln(fileLine);
1073 errorCode.set(U_PARSE_ERROR);
1074 return Collation::NO_LEVEL;
1075 }
1076 if(start < fileLine.length()) {
1077 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1078 infoln(fileLine);
1079 errorCode.set(U_PARSE_ERROR);
1080 return Collation::NO_LEVEL;
1081 }
1082 return relation;
1083 }
1084
1085 static const struct {
1086 const char *name;
1087 UColAttribute attr;
1088 } attributes[] = {
1089 { "backwards", UCOL_FRENCH_COLLATION },
1090 { "alternate", UCOL_ALTERNATE_HANDLING },
1091 { "caseFirst", UCOL_CASE_FIRST },
1092 { "caseLevel", UCOL_CASE_LEVEL },
1093 // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1094 { "strength", UCOL_STRENGTH },
1095 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1096 { "numeric", UCOL_NUMERIC_COLLATION }
1097 };
1098
1099 static const struct {
1100 const char *name;
1101 UColAttributeValue value;
1102 } attributeValues[] = {
1103 { "default", UCOL_DEFAULT },
1104 { "primary", UCOL_PRIMARY },
1105 { "secondary", UCOL_SECONDARY },
1106 { "tertiary", UCOL_TERTIARY },
1107 { "quaternary", UCOL_QUATERNARY },
1108 { "identical", UCOL_IDENTICAL },
1109 { "off", UCOL_OFF },
1110 { "on", UCOL_ON },
1111 { "shifted", UCOL_SHIFTED },
1112 { "non-ignorable", UCOL_NON_IGNORABLE },
1113 { "lower", UCOL_LOWER_FIRST },
1114 { "upper", UCOL_UPPER_FIRST }
1115 };
1116
parseAndSetAttribute(IcuTestErrorCode & errorCode)1117 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1118 // Parse attributes even if the Collator could not be created,
1119 // in order to report syntax errors.
1120 int32_t start = skipSpaces(1);
1121 int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1122 if(equalPos < 0) {
1123 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1124 parseAndSetReorderCodes(start + 7, errorCode);
1125 return;
1126 }
1127 errln("missing '=' on line %d", (int)fileLineNumber);
1128 infoln(fileLine);
1129 errorCode.set(U_PARSE_ERROR);
1130 return;
1131 }
1132
1133 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1134 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1135 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1136 UColReorderCode max;
1137 if(valueString == UNICODE_STRING("space", 5)) {
1138 max = UCOL_REORDER_CODE_SPACE;
1139 } else if(valueString == UNICODE_STRING("punct", 5)) {
1140 max = UCOL_REORDER_CODE_PUNCTUATION;
1141 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1142 max = UCOL_REORDER_CODE_SYMBOL;
1143 } else if(valueString == UNICODE_STRING("currency", 8)) {
1144 max = UCOL_REORDER_CODE_CURRENCY;
1145 } else {
1146 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1147 infoln(fileLine);
1148 errorCode.set(U_PARSE_ERROR);
1149 return;
1150 }
1151 if(coll != NULL) {
1152 coll->setMaxVariable(max, errorCode);
1153 if(errorCode.isFailure()) {
1154 errln("setMaxVariable() failed on line %d: %s",
1155 (int)fileLineNumber, errorCode.errorName());
1156 infoln(fileLine);
1157 return;
1158 }
1159 }
1160 fileLine.remove();
1161 return;
1162 }
1163
1164 UColAttribute attr;
1165 for(int32_t i = 0;; ++i) {
1166 if(i == UPRV_LENGTHOF(attributes)) {
1167 errln("invalid attribute name on line %d", (int)fileLineNumber);
1168 infoln(fileLine);
1169 errorCode.set(U_PARSE_ERROR);
1170 return;
1171 }
1172 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1173 attr = attributes[i].attr;
1174 break;
1175 }
1176 }
1177
1178 UColAttributeValue value;
1179 for(int32_t i = 0;; ++i) {
1180 if(i == UPRV_LENGTHOF(attributeValues)) {
1181 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1182 infoln(fileLine);
1183 errorCode.set(U_PARSE_ERROR);
1184 return;
1185 }
1186 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1187 value = attributeValues[i].value;
1188 break;
1189 }
1190 }
1191
1192 if(coll != NULL) {
1193 coll->setAttribute(attr, value, errorCode);
1194 if(errorCode.isFailure()) {
1195 errln("illegal attribute=value combination on line %d: %s",
1196 (int)fileLineNumber, errorCode.errorName());
1197 infoln(fileLine);
1198 return;
1199 }
1200 }
1201 fileLine.remove();
1202 }
1203
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1204 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1205 UVector32 reorderCodes(errorCode);
1206 while(start < fileLine.length()) {
1207 start = skipSpaces(start);
1208 int32_t limit = start;
1209 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1210 CharString name;
1211 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1212 int32_t code = CollationRuleParser::getReorderCode(name.data());
1213 if(code < 0) {
1214 if(uprv_stricmp(name.data(), "default") == 0) {
1215 code = UCOL_REORDER_CODE_DEFAULT; // -1
1216 } else {
1217 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1218 infoln(fileLine);
1219 errorCode.set(U_PARSE_ERROR);
1220 return;
1221 }
1222 }
1223 reorderCodes.addElement(code, errorCode);
1224 start = limit;
1225 }
1226 if(coll != NULL) {
1227 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1228 if(errorCode.isFailure()) {
1229 errln("setReorderCodes() failed on line %d: %s",
1230 (int)fileLineNumber, errorCode.errorName());
1231 infoln(fileLine);
1232 return;
1233 }
1234 }
1235 fileLine.remove();
1236 }
1237
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1238 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1239 UnicodeString rules;
1240 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1241 rules.append(fileLine.unescape());
1242 }
1243 if(errorCode.isFailure()) { return; }
1244 logln(rules);
1245
1246 UParseError parseError;
1247 UnicodeString reason;
1248 delete coll;
1249 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1250 if(coll == NULL) {
1251 errln("unable to allocate a new collator");
1252 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1253 return;
1254 }
1255 if(errorCode.isFailure()) {
1256 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1257 infoln(UnicodeString(" reason: ") + reason);
1258 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1259 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1260 infoln(UnicodeString(" snippet: ...") +
1261 parseError.preContext + "(!)" + parseError.postContext + "...");
1262 }
1263 delete coll;
1264 coll = NULL;
1265 errorCode.reset();
1266 } else {
1267 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1268 UnicodeString(), reason);
1269 }
1270 }
1271
setRootCollator(IcuTestErrorCode & errorCode)1272 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1273 if(errorCode.isFailure()) { return; }
1274 delete coll;
1275 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1276 if(errorCode.isFailure()) {
1277 dataerrln("unable to create a root collator");
1278 return;
1279 }
1280 }
1281
setLocaleCollator(IcuTestErrorCode & errorCode)1282 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1283 if(errorCode.isFailure()) { return; }
1284 delete coll;
1285 coll = NULL;
1286 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1287 if(at >= 0) {
1288 fileLine.setCharAt(at, (UChar)0x2a); // *
1289 }
1290 CharString localeID;
1291 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1292 if(at >= 0) {
1293 localeID.data()[at - 9] = '@';
1294 }
1295 Locale locale(localeID.data());
1296 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1297 errln("invalid language tag on line %d", (int)fileLineNumber);
1298 infoln(fileLine);
1299 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1300 return;
1301 }
1302
1303 logln("creating a collator for locale ID %s", locale.getName());
1304 coll = Collator::createInstance(locale, errorCode);
1305 if(errorCode.isFailure()) {
1306 dataerrln("unable to create a collator for locale %s on line %d",
1307 locale.getName(), (int)fileLineNumber);
1308 infoln(fileLine);
1309 delete coll;
1310 coll = NULL;
1311 errorCode.reset();
1312 }
1313 }
1314
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1315 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1316 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return true; }
1317 // In some sequences with Tibetan composite vowel signs,
1318 // even if the string passes the FCD check,
1319 // those composites must be decomposed.
1320 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1321 int32_t index = 0;
1322 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1323 if(++index < s.length()) {
1324 UChar c = s[index];
1325 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return true; }
1326 }
1327 }
1328 return false;
1329 }
1330
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1331 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1332 CharString &dest, int32_t partSize,
1333 IcuTestErrorCode &errorCode) {
1334 if(errorCode.isFailure()) { return false; }
1335 uint8_t part[32];
1336 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1337 UCharIterator iter;
1338 uiter_setString(&iter, s, length);
1339 uint32_t state[2] = { 0, 0 };
1340 for(;;) {
1341 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1342 UBool done = partLength < partSize;
1343 if(done) {
1344 // At the end, append the next byte as well which should be 00.
1345 ++partLength;
1346 }
1347 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1348 if(done) {
1349 return errorCode.isSuccess();
1350 }
1351 }
1352 }
1353
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1354 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1355 const UChar *s, int32_t length,
1356 CollationKey &key, IcuTestErrorCode &errorCode) {
1357 if(errorCode.isFailure()) { return false; }
1358 coll->getCollationKey(s, length, key, errorCode);
1359 if(errorCode.isFailure()) {
1360 infoln(fileTestName);
1361 errln("Collator(%s).getCollationKey() failed: %s",
1362 norm, errorCode.errorName());
1363 infoln(line);
1364 return false;
1365 }
1366 int32_t keyLength;
1367 const uint8_t *keyBytes = key.getByteArray(keyLength);
1368 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1369 infoln(fileTestName);
1370 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1371 norm);
1372 infoln(line);
1373 infoln(printCollationKey(key));
1374 return false;
1375 }
1376
1377 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1378 if(numLevels < UCOL_IDENTICAL) {
1379 ++numLevels;
1380 } else {
1381 numLevels = 5;
1382 }
1383 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1384 ++numLevels;
1385 }
1386 errorCode.assertSuccess();
1387 int32_t numLevelSeparators = 0;
1388 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1389 uint8_t b = keyBytes[i];
1390 if(b == 0) {
1391 infoln(fileTestName);
1392 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1393 infoln(line);
1394 infoln(printCollationKey(key));
1395 return false;
1396 }
1397 if(b == 1) { ++numLevelSeparators; }
1398 }
1399 if(numLevelSeparators != (numLevels - 1)) {
1400 infoln(fileTestName);
1401 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1402 norm, (int)numLevelSeparators, (int)numLevels);
1403 infoln(line);
1404 infoln(printCollationKey(key));
1405 return false;
1406 }
1407
1408 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1409 static const int32_t partSizes[] = { 32, 3, 1 };
1410 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1411 int32_t partSize = partSizes[psi];
1412 CharString parts;
1413 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1414 infoln(fileTestName);
1415 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1416 norm, (int)partSize, errorCode.errorName());
1417 infoln(line);
1418 return false;
1419 }
1420 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1421 infoln(fileTestName);
1422 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1423 norm, (int)partSize);
1424 infoln(line);
1425 infoln(printCollationKey(key));
1426 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1427 return false;
1428 }
1429 }
1430 return true;
1431 }
1432
1433 /**
1434 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1435 * Leaves key unchanged if s does not contain U+FFFE.
1436 * @return true if the key was successfully changed
1437 */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1438 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1439 CollationKey &key, IcuTestErrorCode &errorCode) {
1440 if(errorCode.isFailure()) { return false; }
1441 LocalMemory<uint8_t> mergedKey;
1442 int32_t mergedKeyLength = 0;
1443 int32_t mergedKeyCapacity = 0;
1444 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1445 int32_t segmentStart = 0;
1446 for(int32_t i = 0;;) {
1447 if(i == sLength) {
1448 if(segmentStart == 0) {
1449 // s does not contain any U+FFFE.
1450 return false;
1451 }
1452 } else if(s[i] != 0xfffe) {
1453 ++i;
1454 continue;
1455 }
1456 // Get the sort key for another segment and merge it into mergedKey.
1457 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1458 CollationKey key2;
1459 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1460 int32_t key1Length, key2Length;
1461 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1462 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1463 uint8_t *dest;
1464 int32_t minCapacity = key1Length + key2Length;
1465 if(key1Length > 0) { --minCapacity; }
1466 if(minCapacity <= mergedKeyCapacity) {
1467 dest = mergedKey.getAlias();
1468 } else {
1469 if(minCapacity <= 200) {
1470 mergedKeyCapacity = 200;
1471 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1472 mergedKeyCapacity *= 2;
1473 } else {
1474 mergedKeyCapacity = minCapacity;
1475 }
1476 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1477 }
1478 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1479 if(key1Length == 0) {
1480 // key2 is the sort key for the first segment.
1481 uprv_memcpy(dest, key2Bytes, key2Length);
1482 mergedKeyLength = key2Length;
1483 } else {
1484 mergedKeyLength =
1485 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1486 dest, mergedKeyCapacity);
1487 }
1488 if(i == sLength) { break; }
1489 segmentStart = ++i;
1490 }
1491 key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1492 return true;
1493 }
1494
1495 namespace {
1496
1497 /**
1498 * Replaces unpaired surrogates with U+FFFD.
1499 * Returns s if no replacement was made, otherwise buffer.
1500 */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1501 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1502 int32_t i = 0;
1503 while(i < s.length()) {
1504 UChar32 c = s.char32At(i);
1505 if(U_IS_SURROGATE(c)) {
1506 if(buffer.length() < i) {
1507 buffer.append(s, buffer.length(), i - buffer.length());
1508 }
1509 buffer.append((UChar)0xfffd);
1510 }
1511 i += U16_LENGTH(c);
1512 }
1513 if(buffer.isEmpty()) {
1514 return s;
1515 }
1516 if(buffer.length() < i) {
1517 buffer.append(s, buffer.length(), i - buffer.length());
1518 }
1519 return buffer;
1520 }
1521
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1522 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1523 UCollationResult order, UBool collHasCaseLevel) {
1524 if(order == UCOL_EQUAL) {
1525 return Collation::NO_LEVEL;
1526 }
1527 int32_t prevKeyLength;
1528 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1529 int32_t keyLength;
1530 const uint8_t *bytes = key.getByteArray(keyLength);
1531 int32_t level = Collation::PRIMARY_LEVEL;
1532 for(int32_t i = 0;; ++i) {
1533 uint8_t b = prevBytes[i];
1534 if(b != bytes[i]) { break; }
1535 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1536 ++level;
1537 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1538 ++level;
1539 }
1540 }
1541 }
1542 return level;
1543 }
1544
1545 }
1546
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1547 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1548 const UnicodeString &prevString, const UnicodeString &s,
1549 UCollationResult expectedOrder, Collation::Level expectedLevel,
1550 IcuTestErrorCode &errorCode) {
1551 if(errorCode.isFailure()) { return false; }
1552
1553 // Get the sort keys first, for error debug output.
1554 CollationKey prevKey;
1555 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1556 prevKey, errorCode)) {
1557 return false;
1558 }
1559 CollationKey key;
1560 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return false; }
1561
1562 UCollationResult order = coll->compare(prevString, s, errorCode);
1563 if(order != expectedOrder || errorCode.isFailure()) {
1564 infoln(fileTestName);
1565 errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1566 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1567 infoln(prevFileLine);
1568 infoln(fileLine);
1569 infoln(printCollationKey(prevKey));
1570 infoln(printCollationKey(key));
1571 return false;
1572 }
1573 order = coll->compare(s, prevString, errorCode);
1574 if(order != -expectedOrder || errorCode.isFailure()) {
1575 infoln(fileTestName);
1576 errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1577 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1578 infoln(prevFileLine);
1579 infoln(fileLine);
1580 infoln(printCollationKey(prevKey));
1581 infoln(printCollationKey(key));
1582 return false;
1583 }
1584 // Test NUL-termination if the strings do not contain NUL characters.
1585 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1586 if(!containNUL) {
1587 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1588 if(order != expectedOrder || errorCode.isFailure()) {
1589 infoln(fileTestName);
1590 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1591 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1592 infoln(prevFileLine);
1593 infoln(fileLine);
1594 infoln(printCollationKey(prevKey));
1595 infoln(printCollationKey(key));
1596 return false;
1597 }
1598 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1599 if(order != -expectedOrder || errorCode.isFailure()) {
1600 infoln(fileTestName);
1601 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1602 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1603 infoln(prevFileLine);
1604 infoln(fileLine);
1605 infoln(printCollationKey(prevKey));
1606 infoln(printCollationKey(key));
1607 return false;
1608 }
1609 }
1610
1611 // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1612 // Unpaired surrogates cannot be converted to UTF-8.
1613 // Create valid UTF-16 strings if necessary, and use those for
1614 // both the expected compare() result and for the input to compare(UTF-8).
1615 UnicodeString prevBuffer, sBuffer;
1616 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1617 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1618 std::string prevUTF8, sUTF8;
1619 UnicodeString(prevValid).toUTF8String(prevUTF8);
1620 UnicodeString(sValid).toUTF8String(sUTF8);
1621 UCollationResult expectedUTF8Order;
1622 if(&prevValid == &prevString && &sValid == &s) {
1623 expectedUTF8Order = expectedOrder;
1624 } else {
1625 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1626 }
1627
1628 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1629 if(order != expectedUTF8Order || errorCode.isFailure()) {
1630 infoln(fileTestName);
1631 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1632 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1633 infoln(prevFileLine);
1634 infoln(fileLine);
1635 infoln(printCollationKey(prevKey));
1636 infoln(printCollationKey(key));
1637 return false;
1638 }
1639 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1640 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1641 infoln(fileTestName);
1642 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1643 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1644 infoln(prevFileLine);
1645 infoln(fileLine);
1646 infoln(printCollationKey(prevKey));
1647 infoln(printCollationKey(key));
1648 return false;
1649 }
1650 // Test NUL-termination if the strings do not contain NUL characters.
1651 if(!containNUL) {
1652 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1653 if(order != expectedUTF8Order || errorCode.isFailure()) {
1654 infoln(fileTestName);
1655 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1656 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1657 infoln(prevFileLine);
1658 infoln(fileLine);
1659 infoln(printCollationKey(prevKey));
1660 infoln(printCollationKey(key));
1661 return false;
1662 }
1663 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1664 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1665 infoln(fileTestName);
1666 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1667 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1668 infoln(prevFileLine);
1669 infoln(fileLine);
1670 infoln(printCollationKey(prevKey));
1671 infoln(printCollationKey(key));
1672 return false;
1673 }
1674 }
1675
1676 UCharIterator leftIter;
1677 UCharIterator rightIter;
1678 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1679 uiter_setString(&rightIter, s.getBuffer(), s.length());
1680 order = coll->compare(leftIter, rightIter, errorCode);
1681 if(order != expectedOrder || errorCode.isFailure()) {
1682 infoln(fileTestName);
1683 errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1684 "wrong order: %d != %d (%s)",
1685 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1686 infoln(prevFileLine);
1687 infoln(fileLine);
1688 infoln(printCollationKey(prevKey));
1689 infoln(printCollationKey(key));
1690 return false;
1691 }
1692
1693 order = prevKey.compareTo(key, errorCode);
1694 if(order != expectedOrder || errorCode.isFailure()) {
1695 infoln(fileTestName);
1696 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1697 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1698 infoln(prevFileLine);
1699 infoln(fileLine);
1700 infoln(printCollationKey(prevKey));
1701 infoln(printCollationKey(key));
1702 return false;
1703 }
1704 UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1705 int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1706 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1707 if(level != expectedLevel) {
1708 infoln(fileTestName);
1709 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1710 (int)fileLineNumber, norm, order, level, expectedLevel);
1711 infoln(prevFileLine);
1712 infoln(fileLine);
1713 infoln(printCollationKey(prevKey));
1714 infoln(printCollationKey(key));
1715 return false;
1716 }
1717 }
1718
1719 // If either string contains U+FFFE, then their sort keys must compare the same as
1720 // the merged sort keys of each string's between-FFFE segments.
1721 //
1722 // It is not required that
1723 // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1724 // only that those two methods yield the same order.
1725 //
1726 // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1727 if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1728 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1729 errorCode.isFailure()) {
1730 order = prevKey.compareTo(key, errorCode);
1731 if(order != expectedOrder || errorCode.isFailure()) {
1732 infoln(fileTestName);
1733 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1734 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1735 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1736 infoln(prevFileLine);
1737 infoln(fileLine);
1738 infoln(printCollationKey(prevKey));
1739 infoln(printCollationKey(key));
1740 return false;
1741 }
1742 int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1743 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1744 if(mergedLevel != level) {
1745 infoln(fileTestName);
1746 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1747 "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1748 (int)fileLineNumber, norm, order, mergedLevel, level);
1749 infoln(prevFileLine);
1750 infoln(fileLine);
1751 infoln(printCollationKey(prevKey));
1752 infoln(printCollationKey(key));
1753 return false;
1754 }
1755 }
1756 }
1757 return true;
1758 }
1759
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1760 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1761 if(errorCode.isFailure()) { return; }
1762 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1763 UnicodeString prevString, s;
1764 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1765 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1766 // Parse the line even if it will be ignored (when we do not have a Collator)
1767 // in order to report syntax issues.
1768 Collation::Level relation = parseRelationAndString(s, errorCode);
1769 if(errorCode.isFailure()) {
1770 errorCode.reset();
1771 break;
1772 }
1773 if(coll == NULL) {
1774 // We were unable to create the Collator but continue with tests.
1775 // Ignore test data for this Collator.
1776 // The next Collator creation might work.
1777 continue;
1778 }
1779 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1780 Collation::Level expectedLevel = relation;
1781 s.getTerminatedBuffer(); // Ensure NUL-termination.
1782 UBool isOk = true;
1783 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1784 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1785 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1786 expectedOrder, expectedLevel, errorCode);
1787 }
1788 if(isOk) {
1789 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1790 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1791 expectedOrder, expectedLevel, errorCode);
1792 }
1793 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1794 UnicodeString pn = nfd->normalize(prevString, errorCode);
1795 UnicodeString n = nfd->normalize(s, errorCode);
1796 pn.getTerminatedBuffer();
1797 n.getTerminatedBuffer();
1798 errorCode.assertSuccess();
1799 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1800 expectedOrder, expectedLevel, errorCode);
1801 }
1802 if(!isOk) {
1803 errorCode.reset(); // already reported
1804 }
1805 prevFileLine = fileLine;
1806 prevString = s;
1807 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1808 }
1809 }
1810
TestDataDriven()1811 void CollationTest::TestDataDriven() {
1812 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1813
1814 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1815 nfd = Normalizer2::getNFDInstance(errorCode);
1816 if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1817 return;
1818 }
1819
1820 CharString path(getSourceTestData(errorCode), errorCode);
1821 path.appendPathPart("collationtest.txt", errorCode);
1822 const char *codePage = "UTF-8";
1823 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, true, false, errorCode));
1824 if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1825 return;
1826 }
1827 // Read a new line if necessary.
1828 // Sub-parsers leave the first line set that they do not handle.
1829 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1830 if(!isSectionStarter(fileLine[0])) {
1831 errln("syntax error on line %d", (int)fileLineNumber);
1832 infoln(fileLine);
1833 return;
1834 }
1835 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1836 fileTestName = fileLine;
1837 logln(fileLine);
1838 fileLine.remove();
1839 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1840 setRootCollator(errorCode);
1841 fileLine.remove();
1842 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1843 setLocaleCollator(errorCode);
1844 fileLine.remove();
1845 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1846 buildTailoring(f.getAlias(), errorCode);
1847 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1848 parseAndSetAttribute(errorCode);
1849 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1850 checkCompareStrings(f.getAlias(), errorCode);
1851 } else {
1852 errln("syntax error on line %d", (int)fileLineNumber);
1853 infoln(fileLine);
1854 return;
1855 }
1856 }
1857 }
1858
TestLongLocale()1859 void CollationTest::TestLongLocale() {
1860 IcuTestErrorCode errorCode(*this, "TestLongLocale");
1861 Locale longLocale("sie__1G_C_CEIE_CEZCX_CSUE_E_EIESZNI2_GB_LM_LMCSUE_LMCSX_"
1862 "LVARIANT_MMCSIE_STEU_SU1GCEIE_SU6G_SU6SU6G_U_UBGE_UC_"
1863 "UCEZCSI_UCIE_UZSIU_VARIANT_X@collation=bcs-ukvsz");
1864 LocalPointer<Collator> coll(Collator::createInstance(longLocale, errorCode));
1865 }
1866
TestBuilderContextsOverflow()1867 void CollationTest::TestBuilderContextsOverflow() {
1868 IcuTestErrorCode errorCode(*this, "TestBuilderContextsOverflow");
1869 // ICU-20715: Bad memory access in what looks like a bogus CharsTrie after
1870 // intermediate contextual-mappings data overflowed.
1871 // Caused by the CollationDataBuilder using some outdated values when building
1872 // contextual mappings with both prefix and contraction matching.
1873 // Fixed by resetting those outdated values before code looks at them.
1874 char16_t rules[] = {
1875 u'&', 0x10, 0x2ff, 0x503c, 0x4617,
1876 u'=', 0x80, 0x4f7f, 0xff, 0x3c3d, 0x1c4f, 0x3c3c,
1877 u'<', 0, 0, 0, 0, u'|', 0, 0, 0, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f, 0xff,
1878 u'=', 0, u'|', 0, 0, 0, 0, 0, 0, 0x1f00, 0xe30,
1879 0x3035, 0, 0, 0xd200, 0, 0x7f00, 0xff4f, 0x3d00, 0, 0x7c00,
1880 0, 0, 0, 0, 0, 0, 0, 0x301f, 0x350e, 0x30,
1881 0, 0, 0xd2, 0x7c00, 0, 0, 0, 0, 0, 0,
1882 0, 0x301f, 0x350e, 0x30, 0, 0, 0x52d2, 0x2f3c, 0x5552, 0x493c,
1883 0x1f10, 0x1f50, 0x300, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f,
1884 0xff,
1885 u'=', 0, u'|', 0, 0, 0, 0, 0x5000, 0x4617,
1886 u'=', 0x80, 0x4f7f, 0, 0, 0xd200, 0
1887 };
1888 UnicodeString s(false, rules, UPRV_LENGTHOF(rules));
1889 LocalPointer<Collator> coll(new RuleBasedCollator(s, errorCode), errorCode);
1890 if(errorCode.isSuccess()) {
1891 logln("successfully built the Collator");
1892 }
1893 }
1894
1895 #endif // !UCONFIG_NO_COLLATION
1896