1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/tblcoll.h"
26 #include "unicode/uiter.h"
27 #include "unicode/uniset.h"
28 #include "unicode/unistr.h"
29 #include "unicode/usetiter.h"
30 #include "unicode/ustring.h"
31 #include "charstr.h"
32 #include "cmemory.h"
33 #include "collation.h"
34 #include "collationdata.h"
35 #include "collationfcd.h"
36 #include "collationiterator.h"
37 #include "collationroot.h"
38 #include "collationrootelements.h"
39 #include "collationruleparser.h"
40 #include "collationweights.h"
41 #include "cstring.h"
42 #include "intltest.h"
43 #include "normalizer2impl.h"
44 #include "ucbuf.h"
45 #include "uhash.h"
46 #include "uitercollationiterator.h"
47 #include "utf16collationiterator.h"
48 #include "utf8collationiterator.h"
49 #include "uvectr32.h"
50 #include "uvectr64.h"
51 #include "writesrc.h"
52
53 class CodePointIterator;
54
55 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
56
57 class CollationTest : public IntlTest {
58 public:
CollationTest()59 CollationTest()
60 : fcd(NULL), nfd(NULL),
61 fileLineNumber(0),
62 coll(NULL) {}
63
~CollationTest()64 ~CollationTest() {
65 delete coll;
66 }
67
68 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
69
70 void TestMinMax();
71 void TestImplicits();
72 void TestNulTerminated();
73 void TestIllegalUTF8();
74 void TestShortFCDData();
75 void TestFCD();
76 void TestCollationWeights();
77 void TestRootElements();
78 void TestTailoredElements();
79 void TestDataDriven();
80
81 private:
82 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
83 void checkAllocWeights(CollationWeights &cw,
84 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
85 int32_t someLength, int32_t minCount);
86
87 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
88 static UnicodeString printCollationKey(const CollationKey &key);
89
90 // Helpers & fields for data-driven test.
isCROrLF(UChar c)91 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)92 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)93 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
skipSpaces(int32_t i)94 int32_t skipSpaces(int32_t i) {
95 while(isSpace(fileLine[i])) { ++i; }
96 return i;
97 }
98
99 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
100 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
101 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
102 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
103 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
104 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
105 void setRootCollator(IcuTestErrorCode &errorCode);
106 void setLocaleCollator(IcuTestErrorCode &errorCode);
107
108 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
109
110 UBool getSortKeyParts(const UChar *s, int32_t length,
111 CharString &dest, int32_t partSize,
112 IcuTestErrorCode &errorCode);
113 UBool getCollationKey(const char *norm, const UnicodeString &line,
114 const UChar *s, int32_t length,
115 CollationKey &key, IcuTestErrorCode &errorCode);
116 UBool getMergedCollationKey(const UChar *s, int32_t length,
117 CollationKey &key, IcuTestErrorCode &errorCode);
118 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
119 const UnicodeString &prevString, const UnicodeString &s,
120 UCollationResult expectedOrder, Collation::Level expectedLevel,
121 IcuTestErrorCode &errorCode);
122 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
123
124 const Normalizer2 *fcd, *nfd;
125 UnicodeString fileLine;
126 int32_t fileLineNumber;
127 UnicodeString fileTestName;
128 Collator *coll;
129 };
130
createCollationTest()131 extern IntlTest *createCollationTest() {
132 return new CollationTest();
133 }
134
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)135 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
136 if(exec) {
137 logln("TestSuite CollationTest: ");
138 }
139 TESTCASE_AUTO_BEGIN;
140 TESTCASE_AUTO(TestMinMax);
141 TESTCASE_AUTO(TestImplicits);
142 TESTCASE_AUTO(TestNulTerminated);
143 TESTCASE_AUTO(TestIllegalUTF8);
144 TESTCASE_AUTO(TestShortFCDData);
145 TESTCASE_AUTO(TestFCD);
146 TESTCASE_AUTO(TestCollationWeights);
147 TESTCASE_AUTO(TestRootElements);
148 TESTCASE_AUTO(TestTailoredElements);
149 TESTCASE_AUTO(TestDataDriven);
150 TESTCASE_AUTO_END;
151 }
152
TestMinMax()153 void CollationTest::TestMinMax() {
154 IcuTestErrorCode errorCode(*this, "TestMinMax");
155
156 setRootCollator(errorCode);
157 if(errorCode.isFailure()) {
158 errorCode.reset();
159 return;
160 }
161 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
162 if(rbc == NULL) {
163 errln("the root collator is not a RuleBasedCollator");
164 return;
165 }
166
167 static const UChar s[2] = { 0xfffe, 0xffff };
168 UVector64 ces(errorCode);
169 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
170 errorCode.assertSuccess();
171 if(ces.size() != 2) {
172 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
173 return;
174 }
175 int64_t ce = ces.elementAti(0);
176 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
177 if(ce != expected) {
178 errln("CE(U+fffe)=%04lx != 02..", (long)ce);
179 }
180
181 ce = ces.elementAti(1);
182 expected = Collation::makeCE(Collation::MAX_PRIMARY);
183 if(ce != expected) {
184 errln("CE(U+ffff)=%04lx != max..", (long)ce);
185 }
186 }
187
TestImplicits()188 void CollationTest::TestImplicits() {
189 IcuTestErrorCode errorCode(*this, "TestImplicits");
190
191 const CollationData *cd = CollationRoot::getData(errorCode);
192 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
193 return;
194 }
195
196 // Implicit primary weights should be assigned for the following sets,
197 // and sort in ascending order by set and then code point.
198 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
199
200 // core Han Unified Ideographs
201 UnicodeSet coreHan("[\\p{unified_ideograph}&"
202 "[\\p{Block=CJK_Unified_Ideographs}"
203 "\\p{Block=CJK_Compatibility_Ideographs}]]",
204 errorCode);
205 // all other Unified Han ideographs
206 UnicodeSet otherHan("[\\p{unified ideograph}-"
207 "[\\p{Block=CJK_Unified_Ideographs}"
208 "\\p{Block=CJK_Compatibility_Ideographs}]]",
209 errorCode);
210 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
211 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
212
213 // Starting with CLDR 26/ICU 54, the root Han order may instead be
214 // the Unihan radical-stroke order.
215 // The tests should pass either way, so we only test the order of a small set of Han characters
216 // whose radical-stroke order is the same as their code point order.
217 UnicodeSet someHanInCPOrder(
218 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
219 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
220 errorCode);
221 UnicodeSet inOrder(someHanInCPOrder);
222 inOrder.addAll(unassigned).freeze();
223 if(errorCode.logIfFailureAndReset("UnicodeSet")) {
224 return;
225 }
226 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
227 UChar32 prev = 0;
228 uint32_t prevPrimary = 0;
229 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
230 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
231 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
232 while(iter->next()) {
233 UChar32 c = iter->getCodepoint();
234 UnicodeString s(c);
235 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
236 int64_t ce = ci.nextCE(errorCode);
237 int64_t ce2 = ci.nextCE(errorCode);
238 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
239 return;
240 }
241 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
242 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
243 continue;
244 }
245 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
246 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
247 (long)c, (long)(ce & 0xffffffff));
248 continue;
249 }
250 uint32_t primary = (uint32_t)(ce >> 32);
251 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
252 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
253 (long)c, (long)primary, (long)prev, (long)prevPrimary);
254 }
255 prev = c;
256 prevPrimary = primary;
257 }
258 }
259 }
260
TestNulTerminated()261 void CollationTest::TestNulTerminated() {
262 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
263 const CollationData *data = CollationRoot::getData(errorCode);
264 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
265 return;
266 }
267
268 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
269
270 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
271 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
272 for(int32_t i = 0;; ++i) {
273 int64_t ce1 = ci1.nextCE(errorCode);
274 int64_t ce2 = ci2.nextCE(errorCode);
275 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
276 return;
277 }
278 if(ce1 != ce2) {
279 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
280 break;
281 }
282 if(ce1 == Collation::NO_CE) { break; }
283 }
284 }
285
TestIllegalUTF8()286 void CollationTest::TestIllegalUTF8() {
287 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
288
289 setRootCollator(errorCode);
290 if(errorCode.isFailure()) {
291 errorCode.reset();
292 return;
293 }
294 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
295
296 static const char *strings[] = {
297 // U+FFFD
298 "a\xef\xbf\xbdz",
299 // illegal byte sequences
300 "a\x80z", // trail byte
301 "a\xc1\x81z", // non-shortest form
302 "a\xe0\x82\x83z", // non-shortest form
303 "a\xed\xa0\x80z", // lead surrogate: would be U+D800
304 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
305 "a\xf0\x8f\xbf\xbfz", // non-shortest form
306 "a\xf4\x90\x80\x80z" // out of range: would be U+110000
307 };
308
309 StringPiece fffd(strings[0]);
310 for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
311 StringPiece illegal(strings[i]);
312 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
313 if(order != UCOL_EQUAL) {
314 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
315 (int)i, order);
316 }
317 }
318 }
319
320 namespace {
321
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)322 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
323 for(UChar32 c = 0x10000; c < 0x110000;) {
324 UChar32 next = c + 0x400;
325 if(src.containsSome(c, next - 1)) {
326 dest.add(U16_LEAD(c));
327 }
328 c = next;
329 }
330 }
331
332 } // namespace
333
TestShortFCDData()334 void CollationTest::TestShortFCDData() {
335 // See CollationFCD class comments.
336 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
337 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
338 errorCode.assertSuccess();
339 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
340 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
341 UnicodeSet lccc; // actual
342 for(UChar32 c = 0; c <= 0xffff; ++c) {
343 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
344 }
345 UnicodeSet diff(expectedLccc);
346 diff.removeAll(lccc);
347 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
348 UnicodeString empty("[]");
349 UnicodeString diffString;
350 diff.toPattern(diffString, TRUE);
351 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
352 diff = lccc;
353 diff.removeAll(expectedLccc);
354 diff.toPattern(diffString, TRUE);
355 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
356
357 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
358 if (errorCode.isSuccess()) {
359 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
360 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
361 UnicodeSet tccc; // actual
362 for(UChar32 c = 0; c <= 0xffff; ++c) {
363 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
364 }
365 diff = expectedTccc;
366 diff.removeAll(tccc);
367 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
368 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
369 diff = tccc;
370 diff.removeAll(expectedTccc);
371 diff.toPattern(diffString, TRUE);
372 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
373 }
374 }
375
376 class CodePointIterator {
377 public:
CodePointIterator(const UChar32 * cp,int32_t length)378 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()379 void resetToStart() { pos = 0; }
next()380 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()381 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const382 int32_t getLength() const { return length; }
getIndex() const383 int getIndex() const { return (int)pos; }
384 private:
385 const UChar32 *cp;
386 int32_t length;
387 int32_t pos;
388 };
389
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)390 void CollationTest::checkFCD(const char *name,
391 CollationIterator &ci, CodePointIterator &cpi) {
392 IcuTestErrorCode errorCode(*this, "checkFCD");
393
394 // Iterate forward to the limit.
395 for(;;) {
396 UChar32 c1 = ci.nextCodePoint(errorCode);
397 UChar32 c2 = cpi.next();
398 if(c1 != c2) {
399 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
400 name, (long)c1, (long)c2, cpi.getIndex());
401 return;
402 }
403 if(c1 < 0) { break; }
404 }
405
406 // Iterate backward most of the way.
407 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
408 UChar32 c1 = ci.previousCodePoint(errorCode);
409 UChar32 c2 = cpi.previous();
410 if(c1 != c2) {
411 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
412 name, (long)c1, (long)c2, cpi.getIndex());
413 return;
414 }
415 }
416
417 // Forward again.
418 for(;;) {
419 UChar32 c1 = ci.nextCodePoint(errorCode);
420 UChar32 c2 = cpi.next();
421 if(c1 != c2) {
422 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
423 name, (long)c1, (long)c2, cpi.getIndex());
424 return;
425 }
426 if(c1 < 0) { break; }
427 }
428
429 // Iterate backward to the start.
430 for(;;) {
431 UChar32 c1 = ci.previousCodePoint(errorCode);
432 UChar32 c2 = cpi.previous();
433 if(c1 != c2) {
434 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
435 name, (long)c1, (long)c2, cpi.getIndex());
436 return;
437 }
438 if(c1 < 0) { break; }
439 }
440 }
441
TestFCD()442 void CollationTest::TestFCD() {
443 IcuTestErrorCode errorCode(*this, "TestFCD");
444 const CollationData *data = CollationRoot::getData(errorCode);
445 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
446 return;
447 }
448
449 // Input string, not FCD, NUL-terminated.
450 static const UChar s[] = {
451 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
452 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
453 0x327, 0x308, // ccc=202, 230
454 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
455 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
456 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
457 0xac01,
458 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
459 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
460 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
461 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
462 0x4e00, 0xf81,
463 0
464 };
465 // Expected code points.
466 static const UChar32 cp[] = {
467 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
468 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
469 0x1D15F, 0x1D16D,
470 0xac01,
471 0x63, 0x327, 0x1D165, 0x1D16D,
472 0x61,
473 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
474 0x4e00, 0xf71, 0xf80
475 };
476
477 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
478 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
479 return;
480 }
481 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
482 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
483
484 #if U_HAVE_STD_STRING
485 cpi.resetToStart();
486 std::string utf8;
487 UnicodeString(s).toUTF8String(utf8);
488 FCDUTF8CollationIterator u8ci(data, FALSE,
489 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
490 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
491 return;
492 }
493 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
494 #endif
495
496 cpi.resetToStart();
497 UCharIterator iter;
498 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the terminating NUL
499 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
500 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
501 return;
502 }
503 checkFCD("FCDUIterCollationIterator", uici, cpi);
504 }
505
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)506 void CollationTest::checkAllocWeights(CollationWeights &cw,
507 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
508 int32_t someLength, int32_t minCount) {
509 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
510 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
511 (long)lowerLimit, (long)upperLimit, (long)n);
512 return;
513 }
514 uint32_t previous = lowerLimit;
515 int32_t count = 0; // number of weights that have someLength
516 for(int32_t i = 0; i < n; ++i) {
517 uint32_t w = cw.nextWeight();
518 if(w == 0xffffffff) {
519 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
520 "returns only %ld weights",
521 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
522 return;
523 }
524 if(!(previous < w && w < upperLimit)) {
525 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
526 "number %ld -> %lx not between %lx and %lx",
527 (long)lowerLimit, (long)upperLimit, (long)n,
528 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
529 return;
530 }
531 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
532 }
533 if(count < minCount) {
534 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
535 "returns only %ld < %ld weights of length %d",
536 (long)lowerLimit, (long)upperLimit, (long)n,
537 (long)count, (long)minCount, (int)someLength);
538 }
539 }
540
TestCollationWeights()541 void CollationTest::TestCollationWeights() {
542 CollationWeights cw;
543
544 // Non-compressible primaries use 254 second bytes 02..FF.
545 logln("CollationWeights.initForPrimary(non-compressible)");
546 cw.initForPrimary(FALSE);
547 // Expect 1 weight 11 and 254 weights 12xx.
548 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
549 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
550 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
551 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
552 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
553 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
554 // Expect 254^2=64516 three-byte weights.
555 // During computation, there should be 3 three-byte ranges
556 // 10ffff, 11xxxx, 120202.
557 // The middle one should be split 64515:1,
558 // and the newly-split-off range and the last ranged lengthened.
559 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
560 // Expect weights 1102 & 1103.
561 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
562 // Expect weights 102102 & 102103.
563 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
564
565 // Compressible primaries use 251 second bytes 04..FE.
566 logln("CollationWeights.initForPrimary(compressible)");
567 cw.initForPrimary(TRUE);
568 // Expect 1 weight 11 and 251 weights 12xx.
569 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
570 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
571 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
572 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
573 // Expect weights 1104 & 1105.
574 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
575 // Expect weights 102102 & 102103.
576 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
577
578 // Secondary and tertiary weights use only bytes 3 & 4.
579 logln("CollationWeights.initForSecondary()");
580 cw.initForSecondary();
581 // Expect weights fbxx and all four fc..ff.
582 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
583
584 logln("CollationWeights.initForTertiary()");
585 cw.initForTertiary();
586 // Expect weights 3dxx and both 3e & 3f.
587 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
588 }
589
590 namespace {
591
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)592 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
593 uint32_t p, uint32_t s, uint32_t ctq) {
594 uint32_t p1 = p >> 24;
595 uint32_t p2 = (p >> 16) & 0xff;
596 uint32_t p3 = (p >> 8) & 0xff;
597 uint32_t p4 = p & 0xff;
598 uint32_t s1 = s >> 8;
599 uint32_t s2 = s & 0xff;
600 // ctq = Case, Tertiary, Quaternary
601 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
602 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
603 uint32_t t1 = t >> 8;
604 uint32_t t2 = t & 0xff;
605 uint32_t q = ctq & Collation::QUATERNARY_MASK;
606 // No leading zero bytes.
607 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
608 return FALSE;
609 }
610 // No intermediate zero bytes.
611 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
612 return FALSE;
613 }
614 if(p2 != 0 && p3 == 0 && p4 != 0) {
615 return FALSE;
616 }
617 // Minimum & maximum lead bytes.
618 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
619 s1 == Collation::LEVEL_SEPARATOR_BYTE ||
620 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
621 return FALSE;
622 }
623 if(c > 2) {
624 return FALSE;
625 }
626 // The valid byte range for the second primary byte depends on compressibility.
627 if(p2 != 0) {
628 if(data.isCompressibleLeadByte(p1)) {
629 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
630 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
631 return FALSE;
632 }
633 } else {
634 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
635 return FALSE;
636 }
637 }
638 }
639 // Other bytes just need to avoid the level separator.
640 // Trailing zeros are ok.
641 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
642 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
643 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
644 return FALSE;
645 }
646 // Well-formed CEs.
647 if(p == 0) {
648 if(s == 0) {
649 if(t == 0) {
650 // Completely ignorable CE.
651 // Quaternary CEs are not supported.
652 if(c != 0 || q != 0) {
653 return FALSE;
654 }
655 } else {
656 // Tertiary CE.
657 if(t < re.getTertiaryBoundary() || c != 2) {
658 return FALSE;
659 }
660 }
661 } else {
662 // Secondary CE.
663 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
664 return FALSE;
665 }
666 }
667 } else {
668 // Primary CE.
669 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
670 s >= re.getSecondaryBoundary()) {
671 return FALSE;
672 }
673 if(t == 0 || t >= re.getTertiaryBoundary()) {
674 return FALSE;
675 }
676 }
677 return TRUE;
678 }
679
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)680 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
681 uint32_t p = (uint32_t)(ce >> 32);
682 uint32_t secTer = (uint32_t)ce;
683 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
684 }
685
686 class RootElementsIterator {
687 public:
RootElementsIterator(const CollationData & root)688 RootElementsIterator(const CollationData &root)
689 : data(root),
690 elements(root.rootElements), length(root.rootElementsLength),
691 pri(0), secTer(0),
692 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
693
next()694 UBool next() {
695 if(index >= length) { return FALSE; }
696 uint32_t p = elements[index];
697 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
698 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
699 ++index;
700 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
701 return TRUE;
702 }
703 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
704 // End of a range, enumerate the primaries in the range.
705 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
706 p &= 0xffffff00;
707 if(pri == p) {
708 // Finished the range, return the next CE after it.
709 ++index;
710 return next();
711 }
712 U_ASSERT(pri < p);
713 // Return the next primary in this range.
714 UBool isCompressible = data.isCompressiblePrimary(pri);
715 if((pri & 0xffff) == 0) {
716 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
717 } else {
718 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
719 }
720 return TRUE;
721 }
722 // Simple primary CE.
723 ++index;
724 pri = p;
725 // Does this have an explicit below-common sec/ter unit,
726 // or does it imply a common one?
727 if(index == length) {
728 secTer = Collation::COMMON_SEC_AND_TER_CE;
729 } else {
730 secTer = elements[index];
731 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
732 // No sec/ter delta.
733 secTer = Collation::COMMON_SEC_AND_TER_CE;
734 } else {
735 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
736 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
737 // Implied sec/ter.
738 secTer = Collation::COMMON_SEC_AND_TER_CE;
739 } else {
740 // Explicit sec/ter below common/common.
741 ++index;
742 }
743 }
744 }
745 return TRUE;
746 }
747
getPrimary() const748 uint32_t getPrimary() const { return pri; }
getSecTer() const749 uint32_t getSecTer() const { return secTer; }
750
751 private:
752 const CollationData &data;
753 const uint32_t *elements;
754 int32_t length;
755
756 uint32_t pri;
757 uint32_t secTer;
758 int32_t index;
759 };
760
761 } // namespace
762
TestRootElements()763 void CollationTest::TestRootElements() {
764 IcuTestErrorCode errorCode(*this, "TestRootElements");
765 const CollationData *root = CollationRoot::getData(errorCode);
766 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
767 return;
768 }
769 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
770 RootElementsIterator iter(*root);
771
772 // We check each root CE for validity,
773 // and we also verify that there is a tailoring gap between each two CEs.
774 CollationWeights cw1c; // compressible primary weights
775 CollationWeights cw1u; // uncompressible primary weights
776 CollationWeights cw2;
777 CollationWeights cw3;
778
779 cw1c.initForPrimary(TRUE);
780 cw1u.initForPrimary(FALSE);
781 cw2.initForSecondary();
782 cw3.initForTertiary();
783
784 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
785 // nor the special merge-separator CE for U+FFFE.
786 uint32_t prevPri = 0;
787 uint32_t prevSec = 0;
788 uint32_t prevTer = 0;
789 while(iter.next()) {
790 uint32_t pri = iter.getPrimary();
791 uint32_t secTer = iter.getSecTer();
792 // CollationRootElements CEs must have 0 case and quaternary bits.
793 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
794 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
795 (long)pri, (long)secTer);
796 }
797 uint32_t sec = secTer >> 16;
798 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
799 uint32_t ctq = ter;
800 if(pri == 0 && sec == 0 && ter != 0) {
801 // Tertiary CEs must have uppercase bits,
802 // but they are not stored in the CollationRootElements.
803 ctq |= 0x8000;
804 }
805 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
806 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
807 } else {
808 if(pri != prevPri) {
809 uint32_t newWeight = 0;
810 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
811 // There is currently no tailoring gap after primary ignorables,
812 // and we forbid tailoring after U+FFFD and U+FFFF.
813 } else if(root->isCompressiblePrimary(prevPri)) {
814 if(!cw1c.allocWeights(prevPri, pri, 1)) {
815 errln("no primary/compressible tailoring gap between %08lx and %08lx",
816 (long)prevPri, (long)pri);
817 } else {
818 newWeight = cw1c.nextWeight();
819 }
820 } else {
821 if(!cw1u.allocWeights(prevPri, pri, 1)) {
822 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
823 (long)prevPri, (long)pri);
824 } else {
825 newWeight = cw1u.nextWeight();
826 }
827 }
828 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
829 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
830 (long)prevPri, (long)newWeight, (long)pri);
831 }
832 } else if(sec != prevSec) {
833 uint32_t lowerLimit =
834 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
835 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
836 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
837 } else {
838 uint32_t newWeight = cw2.nextWeight();
839 if(!(prevSec < newWeight && newWeight < sec)) {
840 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
841 (long)lowerLimit, (long)newWeight, (long)sec);
842 }
843 }
844 } else if(ter != prevTer) {
845 uint32_t lowerLimit =
846 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
847 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
848 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
849 } else {
850 uint32_t newWeight = cw3.nextWeight();
851 if(!(prevTer < newWeight && newWeight < ter)) {
852 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
853 (long)lowerLimit, (long)newWeight, (long)ter);
854 }
855 }
856 } else {
857 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
858 }
859 }
860 prevPri = pri;
861 prevSec = sec;
862 prevTer = ter;
863 }
864 }
865
TestTailoredElements()866 void CollationTest::TestTailoredElements() {
867 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
868 const CollationData *root = CollationRoot::getData(errorCode);
869 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
870 return;
871 }
872 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
873
874 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
875 if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
876 return;
877 }
878 uhash_setKeyDeleter(prevLocales, uprv_free);
879 // TestRootElements() tests the root collator which does not have tailorings.
880 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
881 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
882 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
883
884 UVector64 ces(errorCode);
885 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
886 U_ASSERT(locales.isValid());
887 const char *localeID = "root";
888 do {
889 Locale locale(localeID);
890 LocalPointer<StringEnumeration> types(
891 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
892 errorCode.assertSuccess();
893 const char *type; // first: default type
894 while((type = types->next(NULL, errorCode)) != NULL) {
895 if(strncmp(type, "private-", 8) == 0) {
896 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
897 localeID, type);
898 }
899 Locale localeWithType(locale);
900 localeWithType.setKeywordValue("collation", type, errorCode);
901 errorCode.assertSuccess();
902 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
903 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
904 localeWithType.getName())) {
905 continue;
906 }
907 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
908 if(uhash_geti(prevLocales, actual.getName()) != 0) {
909 continue;
910 }
911 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
912 errorCode.assertSuccess();
913 logln("TestTailoredElements(): requested %s -> actual %s",
914 localeWithType.getName(), actual.getName());
915 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
916 if(rbc == NULL) {
917 continue;
918 }
919 // Note: It would be better to get tailored strings such that we can
920 // identify the prefix, and only get the CEs for the prefix+string,
921 // not also for the prefix.
922 // There is currently no API for that.
923 // It would help in an unusual case where a contraction starting in the prefix
924 // extends past its end, and we do not see the intended mapping.
925 // For example, for a mapping p|st, if there is also a contraction ps,
926 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
927 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
928 errorCode.assertSuccess();
929 UnicodeSetIterator iter(*tailored);
930 while(iter.next()) {
931 const UnicodeString &s = iter.getString();
932 ces.removeAllElements();
933 rbc->internalGetCEs(s, ces, errorCode);
934 errorCode.assertSuccess();
935 for(int32_t i = 0; i < ces.size(); ++i) {
936 int64_t ce = ces.elementAti(i);
937 if(!isValidCE(rootElements, *root, ce)) {
938 errln("invalid tailored CE %016llx at CE index %d from string:",
939 (long long)ce, (int)i);
940 infoln(prettify(s));
941 }
942 }
943 }
944 }
945 } while((localeID = locales->next(NULL, errorCode)) != NULL);
946 uhash_close(prevLocales);
947 }
948
printSortKey(const uint8_t * p,int32_t length)949 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
950 UnicodeString s;
951 for(int32_t i = 0; i < length; ++i) {
952 if(i > 0) { s.append((UChar)0x20); }
953 uint8_t b = p[i];
954 if(b == 0) {
955 s.append((UChar)0x2e); // period
956 } else if(b == 1) {
957 s.append((UChar)0x7c); // vertical bar
958 } else {
959 appendHex(b, 2, s);
960 }
961 }
962 return s;
963 }
964
printCollationKey(const CollationKey & key)965 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
966 int32_t length;
967 const uint8_t *p = key.getByteArray(length);
968 return printSortKey(p, length);
969 }
970
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)971 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
972 for(;;) {
973 int32_t lineLength;
974 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
975 if(line == NULL || errorCode.isFailure()) {
976 fileLine.remove();
977 return FALSE;
978 }
979 ++fileLineNumber;
980 // Strip trailing CR/LF, comments, and spaces.
981 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
982 if(comment != NULL) {
983 lineLength = (int32_t)(comment - line);
984 } else {
985 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
986 }
987 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
988 if(lineLength != 0) {
989 fileLine.setTo(FALSE, line, lineLength);
990 return TRUE;
991 }
992 // Empty line, continue.
993 }
994 }
995
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)996 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
997 UErrorCode &errorCode) {
998 int32_t length = fileLine.length();
999 int32_t i;
1000 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1001 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
1002 if(pipeIndex >= 0) {
1003 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1004 if(prefix.isEmpty()) {
1005 errln("empty prefix on line %d", (int)fileLineNumber);
1006 infoln(fileLine);
1007 errorCode = U_PARSE_ERROR;
1008 return;
1009 }
1010 start = pipeIndex + 1;
1011 } else {
1012 prefix.remove();
1013 }
1014 s = fileLine.tempSubStringBetween(start, i).unescape();
1015 if(s.isEmpty()) {
1016 errln("empty string on line %d", (int)fileLineNumber);
1017 infoln(fileLine);
1018 errorCode = U_PARSE_ERROR;
1019 return;
1020 }
1021 start = i;
1022 }
1023
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1024 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1025 Collation::Level relation;
1026 int32_t start;
1027 if(fileLine[0] == 0x3c) { // <
1028 UChar second = fileLine[1];
1029 start = 2;
1030 switch(second) {
1031 case 0x31: // <1
1032 relation = Collation::PRIMARY_LEVEL;
1033 break;
1034 case 0x32: // <2
1035 relation = Collation::SECONDARY_LEVEL;
1036 break;
1037 case 0x33: // <3
1038 relation = Collation::TERTIARY_LEVEL;
1039 break;
1040 case 0x34: // <4
1041 relation = Collation::QUATERNARY_LEVEL;
1042 break;
1043 case 0x63: // <c
1044 relation = Collation::CASE_LEVEL;
1045 break;
1046 case 0x69: // <i
1047 relation = Collation::IDENTICAL_LEVEL;
1048 break;
1049 default: // just <
1050 relation = Collation::NO_LEVEL;
1051 start = 1;
1052 break;
1053 }
1054 } else if(fileLine[0] == 0x3d) { // =
1055 relation = Collation::ZERO_LEVEL;
1056 start = 1;
1057 } else {
1058 start = 0;
1059 }
1060 if(start == 0 || !isSpace(fileLine[start])) {
1061 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1062 infoln(fileLine);
1063 errorCode.set(U_PARSE_ERROR);
1064 return Collation::NO_LEVEL;
1065 }
1066 start = skipSpaces(start);
1067 UnicodeString prefix;
1068 parseString(start, prefix, s, errorCode);
1069 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1070 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1071 infoln(fileLine);
1072 errorCode.set(U_PARSE_ERROR);
1073 return Collation::NO_LEVEL;
1074 }
1075 if(start < fileLine.length()) {
1076 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1077 infoln(fileLine);
1078 errorCode.set(U_PARSE_ERROR);
1079 return Collation::NO_LEVEL;
1080 }
1081 return relation;
1082 }
1083
/**
 * Attribute names accepted on the left side of '=' in an attribute line,
 * paired with the UColAttribute to set.
 * Searched linearly by CollationTest::parseAndSetAttribute().
 */
static const struct {
    const char *name;
    UColAttribute attr;
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1097
/**
 * Value names accepted on the right side of '=' in an attribute line,
 * paired with the UColAttributeValue to set.
 * Searched linearly by CollationTest::parseAndSetAttribute();
 * whether a value is legal for a given attribute is left to Collator::setAttribute().
 */
static const struct {
    const char *name;
    UColAttributeValue value;
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1115
parseAndSetAttribute(IcuTestErrorCode & errorCode)1116 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1117 // Parse attributes even if the Collator could not be created,
1118 // in order to report syntax errors.
1119 int32_t start = skipSpaces(1);
1120 int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1121 if(equalPos < 0) {
1122 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1123 parseAndSetReorderCodes(start + 7, errorCode);
1124 return;
1125 }
1126 errln("missing '=' on line %d", (int)fileLineNumber);
1127 infoln(fileLine);
1128 errorCode.set(U_PARSE_ERROR);
1129 return;
1130 }
1131
1132 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1133 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1134 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1135 UColReorderCode max;
1136 if(valueString == UNICODE_STRING("space", 5)) {
1137 max = UCOL_REORDER_CODE_SPACE;
1138 } else if(valueString == UNICODE_STRING("punct", 5)) {
1139 max = UCOL_REORDER_CODE_PUNCTUATION;
1140 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1141 max = UCOL_REORDER_CODE_SYMBOL;
1142 } else if(valueString == UNICODE_STRING("currency", 8)) {
1143 max = UCOL_REORDER_CODE_CURRENCY;
1144 } else {
1145 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1146 infoln(fileLine);
1147 errorCode.set(U_PARSE_ERROR);
1148 return;
1149 }
1150 if(coll != NULL) {
1151 coll->setMaxVariable(max, errorCode);
1152 if(errorCode.isFailure()) {
1153 errln("setMaxVariable() failed on line %d: %s",
1154 (int)fileLineNumber, errorCode.errorName());
1155 infoln(fileLine);
1156 return;
1157 }
1158 }
1159 fileLine.remove();
1160 return;
1161 }
1162
1163 UColAttribute attr;
1164 for(int32_t i = 0;; ++i) {
1165 if(i == UPRV_LENGTHOF(attributes)) {
1166 errln("invalid attribute name on line %d", (int)fileLineNumber);
1167 infoln(fileLine);
1168 errorCode.set(U_PARSE_ERROR);
1169 return;
1170 }
1171 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1172 attr = attributes[i].attr;
1173 break;
1174 }
1175 }
1176
1177 UColAttributeValue value;
1178 for(int32_t i = 0;; ++i) {
1179 if(i == UPRV_LENGTHOF(attributeValues)) {
1180 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1181 infoln(fileLine);
1182 errorCode.set(U_PARSE_ERROR);
1183 return;
1184 }
1185 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1186 value = attributeValues[i].value;
1187 break;
1188 }
1189 }
1190
1191 if(coll != NULL) {
1192 coll->setAttribute(attr, value, errorCode);
1193 if(errorCode.isFailure()) {
1194 errln("illegal attribute=value combination on line %d: %s",
1195 (int)fileLineNumber, errorCode.errorName());
1196 infoln(fileLine);
1197 return;
1198 }
1199 }
1200 fileLine.remove();
1201 }
1202
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1203 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1204 UVector32 reorderCodes(errorCode);
1205 while(start < fileLine.length()) {
1206 start = skipSpaces(start);
1207 int32_t limit = start;
1208 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1209 CharString name;
1210 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1211 int32_t code = CollationRuleParser::getReorderCode(name.data());
1212 if(code < 0) {
1213 if(uprv_stricmp(name.data(), "default") == 0) {
1214 code = UCOL_REORDER_CODE_DEFAULT; // -1
1215 } else {
1216 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1217 infoln(fileLine);
1218 errorCode.set(U_PARSE_ERROR);
1219 return;
1220 }
1221 }
1222 reorderCodes.addElement(code, errorCode);
1223 start = limit;
1224 }
1225 if(coll != NULL) {
1226 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1227 if(errorCode.isFailure()) {
1228 errln("setReorderCodes() failed on line %d: %s",
1229 (int)fileLineNumber, errorCode.errorName());
1230 infoln(fileLine);
1231 return;
1232 }
1233 }
1234 fileLine.remove();
1235 }
1236
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1237 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1238 UnicodeString rules;
1239 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1240 rules.append(fileLine.unescape());
1241 }
1242 if(errorCode.isFailure()) { return; }
1243 logln(rules);
1244
1245 UParseError parseError;
1246 UnicodeString reason;
1247 delete coll;
1248 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1249 if(coll == NULL) {
1250 errln("unable to allocate a new collator");
1251 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1252 return;
1253 }
1254 if(errorCode.isFailure()) {
1255 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1256 infoln(UnicodeString(" reason: ") + reason);
1257 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1258 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1259 infoln(UnicodeString(" snippet: ...") +
1260 parseError.preContext + "(!)" + parseError.postContext + "...");
1261 }
1262 delete coll;
1263 coll = NULL;
1264 errorCode.reset();
1265 } else {
1266 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1267 UnicodeString(), reason);
1268 }
1269 }
1270
setRootCollator(IcuTestErrorCode & errorCode)1271 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1272 if(errorCode.isFailure()) { return; }
1273 delete coll;
1274 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1275 if(errorCode.isFailure()) {
1276 dataerrln("unable to create a root collator");
1277 return;
1278 }
1279 }
1280
setLocaleCollator(IcuTestErrorCode & errorCode)1281 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1282 if(errorCode.isFailure()) { return; }
1283 delete coll;
1284 coll = NULL;
1285 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1286 if(at >= 0) {
1287 fileLine.setCharAt(at, (UChar)0x2a); // *
1288 }
1289 CharString localeID;
1290 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1291 if(at >= 0) {
1292 localeID.data()[at - 9] = '@';
1293 }
1294 Locale locale(localeID.data());
1295 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1296 errln("invalid language tag on line %d", (int)fileLineNumber);
1297 infoln(fileLine);
1298 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1299 return;
1300 }
1301
1302 logln("creating a collator for locale ID %s", locale.getName());
1303 coll = Collator::createInstance(locale, errorCode);
1304 if(errorCode.isFailure()) {
1305 dataerrln("unable to create a collator for locale %s on line %d",
1306 locale.getName(), (int)fileLineNumber);
1307 infoln(fileLine);
1308 delete coll;
1309 coll = NULL;
1310 errorCode.reset();
1311 }
1312 }
1313
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1314 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1315 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1316 // In some sequences with Tibetan composite vowel signs,
1317 // even if the string passes the FCD check,
1318 // those composites must be decomposed.
1319 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1320 int32_t index = 0;
1321 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1322 if(++index < s.length()) {
1323 UChar c = s[index];
1324 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1325 }
1326 }
1327 return FALSE;
1328 }
1329
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1330 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1331 CharString &dest, int32_t partSize,
1332 IcuTestErrorCode &errorCode) {
1333 if(errorCode.isFailure()) { return FALSE; }
1334 uint8_t part[32];
1335 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1336 UCharIterator iter;
1337 uiter_setString(&iter, s, length);
1338 uint32_t state[2] = { 0, 0 };
1339 for(;;) {
1340 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1341 UBool done = partLength < partSize;
1342 if(done) {
1343 // At the end, append the next byte as well which should be 00.
1344 ++partLength;
1345 }
1346 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1347 if(done) {
1348 return errorCode.isSuccess();
1349 }
1350 }
1351 }
1352
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1353 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1354 const UChar *s, int32_t length,
1355 CollationKey &key, IcuTestErrorCode &errorCode) {
1356 if(errorCode.isFailure()) { return FALSE; }
1357 coll->getCollationKey(s, length, key, errorCode);
1358 if(errorCode.isFailure()) {
1359 infoln(fileTestName);
1360 errln("Collator(%s).getCollationKey() failed: %s",
1361 norm, errorCode.errorName());
1362 infoln(line);
1363 return FALSE;
1364 }
1365 int32_t keyLength;
1366 const uint8_t *keyBytes = key.getByteArray(keyLength);
1367 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1368 infoln(fileTestName);
1369 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1370 norm);
1371 infoln(line);
1372 infoln(printCollationKey(key));
1373 return FALSE;
1374 }
1375
1376 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1377 if(numLevels < UCOL_IDENTICAL) {
1378 ++numLevels;
1379 } else {
1380 numLevels = 5;
1381 }
1382 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1383 ++numLevels;
1384 }
1385 errorCode.assertSuccess();
1386 int32_t numLevelSeparators = 0;
1387 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1388 uint8_t b = keyBytes[i];
1389 if(b == 0) {
1390 infoln(fileTestName);
1391 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1392 infoln(line);
1393 infoln(printCollationKey(key));
1394 return FALSE;
1395 }
1396 if(b == 1) { ++numLevelSeparators; }
1397 }
1398 if(numLevelSeparators != (numLevels - 1)) {
1399 infoln(fileTestName);
1400 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1401 norm, (int)numLevelSeparators, (int)numLevels);
1402 infoln(line);
1403 infoln(printCollationKey(key));
1404 return FALSE;
1405 }
1406
1407 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1408 static const int32_t partSizes[] = { 32, 3, 1 };
1409 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1410 int32_t partSize = partSizes[psi];
1411 CharString parts;
1412 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1413 infoln(fileTestName);
1414 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1415 norm, (int)partSize, errorCode.errorName());
1416 infoln(line);
1417 return FALSE;
1418 }
1419 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1420 infoln(fileTestName);
1421 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1422 norm, (int)partSize);
1423 infoln(line);
1424 infoln(printCollationKey(key));
1425 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1426 return FALSE;
1427 }
1428 }
1429 return TRUE;
1430 }
1431
1432 /**
1433 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1434 * Leaves key unchanged if s does not contain U+FFFE.
1435 * @return TRUE if the key was successfully changed
1436 */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1437 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1438 CollationKey &key, IcuTestErrorCode &errorCode) {
1439 if(errorCode.isFailure()) { return FALSE; }
1440 LocalMemory<uint8_t> mergedKey;
1441 int32_t mergedKeyLength = 0;
1442 int32_t mergedKeyCapacity = 0;
1443 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1444 int32_t segmentStart = 0;
1445 for(int32_t i = 0;;) {
1446 if(i == sLength) {
1447 if(segmentStart == 0) {
1448 // s does not contain any U+FFFE.
1449 return FALSE;
1450 }
1451 } else if(s[i] != 0xfffe) {
1452 ++i;
1453 continue;
1454 }
1455 // Get the sort key for another segment and merge it into mergedKey.
1456 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1457 CollationKey key2;
1458 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1459 int32_t key1Length, key2Length;
1460 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1461 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1462 uint8_t *dest;
1463 int32_t minCapacity = key1Length + key2Length;
1464 if(key1Length > 0) { --minCapacity; }
1465 if(minCapacity <= mergedKeyCapacity) {
1466 dest = mergedKey.getAlias();
1467 } else {
1468 if(minCapacity <= 200) {
1469 mergedKeyCapacity = 200;
1470 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1471 mergedKeyCapacity *= 2;
1472 } else {
1473 mergedKeyCapacity = minCapacity;
1474 }
1475 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1476 }
1477 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1478 if(key1Length == 0) {
1479 // key2 is the sort key for the first segment.
1480 uprv_memcpy(dest, key2Bytes, key2Length);
1481 mergedKeyLength = key2Length;
1482 } else {
1483 mergedKeyLength =
1484 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1485 dest, mergedKeyCapacity);
1486 }
1487 if(i == sLength) { break; }
1488 segmentStart = ++i;
1489 }
1490 key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1491 return TRUE;
1492 }
1493
1494 namespace {
1495
1496 /**
1497 * Replaces unpaired surrogates with U+FFFD.
1498 * Returns s if no replacement was made, otherwise buffer.
1499 */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1500 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1501 int32_t i = 0;
1502 while(i < s.length()) {
1503 UChar32 c = s.char32At(i);
1504 if(U_IS_SURROGATE(c)) {
1505 if(buffer.length() < i) {
1506 buffer.append(s, buffer.length(), i - buffer.length());
1507 }
1508 buffer.append((UChar)0xfffd);
1509 }
1510 i += U16_LENGTH(c);
1511 }
1512 if(buffer.isEmpty()) {
1513 return s;
1514 }
1515 if(buffer.length() < i) {
1516 buffer.append(s, buffer.length(), i - buffer.length());
1517 }
1518 return buffer;
1519 }
1520
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1521 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1522 UCollationResult order, UBool collHasCaseLevel) {
1523 if(order == UCOL_EQUAL) {
1524 return Collation::NO_LEVEL;
1525 }
1526 int32_t prevKeyLength;
1527 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1528 int32_t keyLength;
1529 const uint8_t *bytes = key.getByteArray(keyLength);
1530 int32_t level = Collation::PRIMARY_LEVEL;
1531 for(int32_t i = 0;; ++i) {
1532 uint8_t b = prevBytes[i];
1533 if(b != bytes[i]) { break; }
1534 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1535 ++level;
1536 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1537 ++level;
1538 }
1539 }
1540 }
1541 return level;
1542 }
1543
1544 }
1545
/**
 * Checks that coll orders prevString and s as expected, through each comparison path:
 * compare() on UnicodeStrings, on NUL-terminated buffers and on UCharIterators,
 * compareUTF8()/internalCompareUTF8() (if U_HAVE_STD_STRING), collation keys,
 * and merged collation keys of the U+FFFE-separated segments.
 * Most paths are also checked with the arguments swapped, expecting the negated order.
 * On the first mismatch, logs the test name, both file lines and both sort keys.
 *
 * @param norm          label for the normalization mode, used only in messages
 * @param prevFileLine  the test file line for prevString (the line for s is fileLine)
 * @param prevString    the previous test string
 * @param s             the current test string
 * @param expectedOrder the expected result of comparing prevString with s
 * @param expectedLevel the expected level of the first sort key difference;
 *                      not checked if it is Collation::NO_LEVEL or the keys are equal
 * @param errorCode     ICU error code; nothing is done if it already indicates a failure
 * @return TRUE if all checks passed
 */
UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
                                     const UnicodeString &prevString, const UnicodeString &s,
                                     UCollationResult expectedOrder, Collation::Level expectedLevel,
                                     IcuTestErrorCode &errorCode) {
    if(errorCode.isFailure()) { return FALSE; }

    // Get the sort keys first, for error debug output.
    CollationKey prevKey;
    if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
                        prevKey, errorCode)) {
        return FALSE;
    }
    CollationKey key;
    if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }

    // compare(UnicodeString, UnicodeString), in both directions.
    UCollationResult order = coll->compare(prevString, s, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compare(s, prevString, errorCode);
    if(order != -expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
    if(!containNUL) {
        // Length -1: the buffers are passed as NUL-terminated strings.
        order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
        if(order != -expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

#if U_HAVE_STD_STRING
    // compare(UTF-16) treats unpaired surrogates like unassigned code points.
    // Unpaired surrogates cannot be converted to UTF-8.
    // Create valid UTF-16 strings if necessary, and use those for
    // both the expected compare() result and for the input to compare(UTF-8).
    UnicodeString prevBuffer, sBuffer;
    const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
    const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
    std::string prevUTF8, sUTF8;
    UnicodeString(prevValid).toUTF8String(prevUTF8);
    UnicodeString(sValid).toUTF8String(sUTF8);
    UCollationResult expectedUTF8Order;
    // surrogatesToFFFD() returns its input object itself if nothing was replaced.
    if(&prevValid == &prevString && &sValid == &s) {
        expectedUTF8Order = expectedOrder;
    } else {
        expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
    }

    order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
    if(order != expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
    if(order != -expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    if(!containNUL) {
        order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
        if(order != expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
        if(order != -expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }
#endif

    // compare(UCharIterator, UCharIterator); checked in the forward direction only.
    UCharIterator leftIter;
    UCharIterator rightIter;
    uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
    uiter_setString(&rightIter, s.getBuffer(), s.length());
    order = coll->compare(leftIter, rightIter, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
              "wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }

    // The collation keys must compare the same way as the strings.
    order = prevKey.compareTo(key, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // The first difference between the keys must be on the expected level,
    // unless the test line does not specify one.
    UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
    int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
    if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
        if(level != expectedLevel) {
            infoln(fileTestName);
            errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
                  (int)fileLineNumber, norm, order, level, expectedLevel);
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // If either string contains U+FFFE, then their sort keys must compare the same as
    // the merged sort keys of each string's between-FFFE segments.
    //
    // It is not required that
    // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
    // only that those two methods yield the same order.
    //
    // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
    // Note: from here on, prevKey and/or key may hold merged keys.
    if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
                getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
            errorCode.isFailure()) {
        order = prevKey.compareTo(key, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                  "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        // The merged keys must differ on the same level as the regular keys did.
        int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
        if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
            if(mergedLevel != level) {
                infoln(fileTestName);
                errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                      "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
                      (int)fileLineNumber, norm, order, mergedLevel, level);
                infoln(prevFileLine);
                infoln(fileLine);
                infoln(printCollationKey(prevKey));
                infoln(printCollationKey(key));
                return FALSE;
            }
        }
    }
    return TRUE;
}
1760
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1761 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1762 if(errorCode.isFailure()) { return; }
1763 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1764 UnicodeString prevString, s;
1765 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1766 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1767 // Parse the line even if it will be ignored (when we do not have a Collator)
1768 // in order to report syntax issues.
1769 Collation::Level relation = parseRelationAndString(s, errorCode);
1770 if(errorCode.isFailure()) {
1771 errorCode.reset();
1772 break;
1773 }
1774 if(coll == NULL) {
1775 // We were unable to create the Collator but continue with tests.
1776 // Ignore test data for this Collator.
1777 // The next Collator creation might work.
1778 continue;
1779 }
1780 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1781 Collation::Level expectedLevel = relation;
1782 s.getTerminatedBuffer(); // Ensure NUL-termination.
1783 UBool isOk = TRUE;
1784 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1785 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1786 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1787 expectedOrder, expectedLevel, errorCode);
1788 }
1789 if(isOk) {
1790 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1791 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1792 expectedOrder, expectedLevel, errorCode);
1793 }
1794 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1795 UnicodeString pn = nfd->normalize(prevString, errorCode);
1796 UnicodeString n = nfd->normalize(s, errorCode);
1797 pn.getTerminatedBuffer();
1798 n.getTerminatedBuffer();
1799 errorCode.assertSuccess();
1800 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1801 expectedOrder, expectedLevel, errorCode);
1802 }
1803 if(!isOk) {
1804 errorCode.reset(); // already reported
1805 }
1806 prevFileLine = fileLine;
1807 prevString = s;
1808 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1809 }
1810 }
1811
TestDataDriven()1812 void CollationTest::TestDataDriven() {
1813 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1814
1815 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1816 nfd = Normalizer2::getNFDInstance(errorCode);
1817 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1818 return;
1819 }
1820
1821 CharString path(getSourceTestData(errorCode), errorCode);
1822 path.appendPathPart("collationtest.txt", errorCode);
1823 const char *codePage = "UTF-8";
1824 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1825 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1826 return;
1827 }
1828 // Read a new line if necessary.
1829 // Sub-parsers leave the first line set that they do not handle.
1830 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1831 if(!isSectionStarter(fileLine[0])) {
1832 errln("syntax error on line %d", (int)fileLineNumber);
1833 infoln(fileLine);
1834 return;
1835 }
1836 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1837 fileTestName = fileLine;
1838 logln(fileLine);
1839 fileLine.remove();
1840 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1841 setRootCollator(errorCode);
1842 fileLine.remove();
1843 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1844 setLocaleCollator(errorCode);
1845 fileLine.remove();
1846 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1847 buildTailoring(f.getAlias(), errorCode);
1848 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1849 parseAndSetAttribute(errorCode);
1850 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1851 checkCompareStrings(f.getAlias(), errorCode);
1852 } else {
1853 errln("syntax error on line %d", (int)fileLineNumber);
1854 infoln(fileLine);
1855 return;
1856 }
1857 }
1858 }
1859
1860 #endif // !UCONFIG_NO_COLLATION
1861