1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/stringpiece.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uniset.h"
29 #include "unicode/unistr.h"
30 #include "unicode/usetiter.h"
31 #include "unicode/ustring.h"
32 #include "charstr.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationfcd.h"
37 #include "collationiterator.h"
38 #include "collationroot.h"
39 #include "collationrootelements.h"
40 #include "collationruleparser.h"
41 #include "collationweights.h"
42 #include "cstring.h"
43 #include "intltest.h"
44 #include "normalizer2impl.h"
45 #include "ucbuf.h"
46 #include "uhash.h"
47 #include "uitercollationiterator.h"
48 #include "utf16collationiterator.h"
49 #include "utf8collationiterator.h"
50 #include "uvectr32.h"
51 #include "uvectr64.h"
52 #include "writesrc.h"
53
54 class CodePointIterator;
55
56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57
58 class CollationTest : public IntlTest {
59 public:
CollationTest()60 CollationTest()
61 : fcd(NULL), nfd(NULL),
62 fileLineNumber(0),
63 coll(NULL) {}
64
~CollationTest()65 ~CollationTest() {
66 delete coll;
67 }
68
69 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
70
71 void TestMinMax();
72 void TestImplicits();
73 void TestNulTerminated();
74 void TestIllegalUTF8();
75 void TestShortFCDData();
76 void TestFCD();
77 void TestCollationWeights();
78 void TestRootElements();
79 void TestTailoredElements();
80 void TestDataDriven();
81
82 private:
83 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
84 void checkAllocWeights(CollationWeights &cw,
85 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
86 int32_t someLength, int32_t minCount);
87
88 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
89 static UnicodeString printCollationKey(const CollationKey &key);
90
91 // Helpers & fields for data-driven test.
isCROrLF(UChar c)92 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)93 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)94 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
skipSpaces(int32_t i)95 int32_t skipSpaces(int32_t i) {
96 while(isSpace(fileLine[i])) { ++i; }
97 return i;
98 }
99
100 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
101 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
102 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
103 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
104 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
105 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
106 void setRootCollator(IcuTestErrorCode &errorCode);
107 void setLocaleCollator(IcuTestErrorCode &errorCode);
108
109 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
110
111 UBool getSortKeyParts(const UChar *s, int32_t length,
112 CharString &dest, int32_t partSize,
113 IcuTestErrorCode &errorCode);
114 UBool getCollationKey(const char *norm, const UnicodeString &line,
115 const UChar *s, int32_t length,
116 CollationKey &key, IcuTestErrorCode &errorCode);
117 UBool getMergedCollationKey(const UChar *s, int32_t length,
118 CollationKey &key, IcuTestErrorCode &errorCode);
119 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
120 const UnicodeString &prevString, const UnicodeString &s,
121 UCollationResult expectedOrder, Collation::Level expectedLevel,
122 IcuTestErrorCode &errorCode);
123 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
124
125 const Normalizer2 *fcd, *nfd;
126 UnicodeString fileLine;
127 int32_t fileLineNumber;
128 UnicodeString fileTestName;
129 Collator *coll;
130 };
131
createCollationTest()132 extern IntlTest *createCollationTest() {
133 return new CollationTest();
134 }
135
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)136 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
137 if(exec) {
138 logln("TestSuite CollationTest: ");
139 }
140 TESTCASE_AUTO_BEGIN;
141 TESTCASE_AUTO(TestMinMax);
142 TESTCASE_AUTO(TestImplicits);
143 TESTCASE_AUTO(TestNulTerminated);
144 TESTCASE_AUTO(TestIllegalUTF8);
145 TESTCASE_AUTO(TestShortFCDData);
146 TESTCASE_AUTO(TestFCD);
147 TESTCASE_AUTO(TestCollationWeights);
148 TESTCASE_AUTO(TestRootElements);
149 TESTCASE_AUTO(TestTailoredElements);
150 TESTCASE_AUTO(TestDataDriven);
151 TESTCASE_AUTO_END;
152 }
153
TestMinMax()154 void CollationTest::TestMinMax() {
155 IcuTestErrorCode errorCode(*this, "TestMinMax");
156
157 setRootCollator(errorCode);
158 if(errorCode.isFailure()) {
159 errorCode.reset();
160 return;
161 }
162 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
163 if(rbc == NULL) {
164 errln("the root collator is not a RuleBasedCollator");
165 return;
166 }
167
168 static const UChar s[2] = { 0xfffe, 0xffff };
169 UVector64 ces(errorCode);
170 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
171 errorCode.assertSuccess();
172 if(ces.size() != 2) {
173 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
174 return;
175 }
176 int64_t ce = ces.elementAti(0);
177 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
178 if(ce != expected) {
179 errln("CE(U+fffe)=%04lx != 02..", (long)ce);
180 }
181
182 ce = ces.elementAti(1);
183 expected = Collation::makeCE(Collation::MAX_PRIMARY);
184 if(ce != expected) {
185 errln("CE(U+ffff)=%04lx != max..", (long)ce);
186 }
187 }
188
TestImplicits()189 void CollationTest::TestImplicits() {
190 IcuTestErrorCode errorCode(*this, "TestImplicits");
191
192 const CollationData *cd = CollationRoot::getData(errorCode);
193 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
194 return;
195 }
196
197 // Implicit primary weights should be assigned for the following sets,
198 // and sort in ascending order by set and then code point.
199 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
200
201 // core Han Unified Ideographs
202 UnicodeSet coreHan("[\\p{unified_ideograph}&"
203 "[\\p{Block=CJK_Unified_Ideographs}"
204 "\\p{Block=CJK_Compatibility_Ideographs}]]",
205 errorCode);
206 // all other Unified Han ideographs
207 UnicodeSet otherHan("[\\p{unified ideograph}-"
208 "[\\p{Block=CJK_Unified_Ideographs}"
209 "\\p{Block=CJK_Compatibility_Ideographs}]]",
210 errorCode);
211 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
212 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
213
214 // Starting with CLDR 26/ICU 54, the root Han order may instead be
215 // the Unihan radical-stroke order.
216 // The tests should pass either way, so we only test the order of a small set of Han characters
217 // whose radical-stroke order is the same as their code point order.
218 UnicodeSet someHanInCPOrder(
219 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
220 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
221 errorCode);
222 UnicodeSet inOrder(someHanInCPOrder);
223 inOrder.addAll(unassigned).freeze();
224 if(errorCode.errIfFailureAndReset("UnicodeSet")) {
225 return;
226 }
227 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
228 UChar32 prev = 0;
229 uint32_t prevPrimary = 0;
230 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
231 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
232 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
233 while(iter->next()) {
234 UChar32 c = iter->getCodepoint();
235 UnicodeString s(c);
236 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
237 int64_t ce = ci.nextCE(errorCode);
238 int64_t ce2 = ci.nextCE(errorCode);
239 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
240 return;
241 }
242 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
243 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
244 continue;
245 }
246 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
247 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
248 (long)c, (long)(ce & 0xffffffff));
249 continue;
250 }
251 uint32_t primary = (uint32_t)(ce >> 32);
252 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
253 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
254 (long)c, (long)primary, (long)prev, (long)prevPrimary);
255 }
256 prev = c;
257 prevPrimary = primary;
258 }
259 }
260 }
261
TestNulTerminated()262 void CollationTest::TestNulTerminated() {
263 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
264 const CollationData *data = CollationRoot::getData(errorCode);
265 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
266 return;
267 }
268
269 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
270
271 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
272 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
273 for(int32_t i = 0;; ++i) {
274 int64_t ce1 = ci1.nextCE(errorCode);
275 int64_t ce2 = ci2.nextCE(errorCode);
276 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
277 return;
278 }
279 if(ce1 != ce2) {
280 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
281 break;
282 }
283 if(ce1 == Collation::NO_CE) { break; }
284 }
285 }
286
TestIllegalUTF8()287 void CollationTest::TestIllegalUTF8() {
288 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
289
290 setRootCollator(errorCode);
291 if(errorCode.isFailure()) {
292 errorCode.reset();
293 return;
294 }
295 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
296
297 static const StringPiece strings[] = {
298 // string with U+FFFD == illegal byte sequence
299 u8"a\uFFFDz", "a\x80z", // trail byte
300 u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
301 u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
302 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
303 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
304 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
305 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
306 };
307
308 for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
309 StringPiece fffd(strings[i]);
310 StringPiece illegal(strings[i + 1]);
311 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
312 if(order != UCOL_EQUAL) {
313 errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
314 (int)i, order);
315 }
316 }
317 }
318
319 namespace {
320
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)321 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
322 for(UChar32 c = 0x10000; c < 0x110000;) {
323 UChar32 next = c + 0x400;
324 if(src.containsSome(c, next - 1)) {
325 dest.add(U16_LEAD(c));
326 }
327 c = next;
328 }
329 }
330
331 } // namespace
332
TestShortFCDData()333 void CollationTest::TestShortFCDData() {
334 // See CollationFCD class comments.
335 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
336 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
337 errorCode.assertSuccess();
338 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
339 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
340 UnicodeSet lccc; // actual
341 for(UChar32 c = 0; c <= 0xffff; ++c) {
342 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
343 }
344 UnicodeSet diff(expectedLccc);
345 diff.removeAll(lccc);
346 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
347 UnicodeString empty("[]");
348 UnicodeString diffString;
349 diff.toPattern(diffString, TRUE);
350 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
351 diff = lccc;
352 diff.removeAll(expectedLccc);
353 diff.toPattern(diffString, TRUE);
354 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
355
356 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
357 if (errorCode.isSuccess()) {
358 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
359 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
360 UnicodeSet tccc; // actual
361 for(UChar32 c = 0; c <= 0xffff; ++c) {
362 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
363 }
364 diff = expectedTccc;
365 diff.removeAll(tccc);
366 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
367 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
368 diff = tccc;
369 diff.removeAll(expectedTccc);
370 diff.toPattern(diffString, TRUE);
371 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
372 }
373 }
374
375 class CodePointIterator {
376 public:
CodePointIterator(const UChar32 * cp,int32_t length)377 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()378 void resetToStart() { pos = 0; }
next()379 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()380 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const381 int32_t getLength() const { return length; }
getIndex() const382 int getIndex() const { return (int)pos; }
383 private:
384 const UChar32 *cp;
385 int32_t length;
386 int32_t pos;
387 };
388
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)389 void CollationTest::checkFCD(const char *name,
390 CollationIterator &ci, CodePointIterator &cpi) {
391 IcuTestErrorCode errorCode(*this, "checkFCD");
392
393 // Iterate forward to the limit.
394 for(;;) {
395 UChar32 c1 = ci.nextCodePoint(errorCode);
396 UChar32 c2 = cpi.next();
397 if(c1 != c2) {
398 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
399 name, (long)c1, (long)c2, cpi.getIndex());
400 return;
401 }
402 if(c1 < 0) { break; }
403 }
404
405 // Iterate backward most of the way.
406 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
407 UChar32 c1 = ci.previousCodePoint(errorCode);
408 UChar32 c2 = cpi.previous();
409 if(c1 != c2) {
410 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
411 name, (long)c1, (long)c2, cpi.getIndex());
412 return;
413 }
414 }
415
416 // Forward again.
417 for(;;) {
418 UChar32 c1 = ci.nextCodePoint(errorCode);
419 UChar32 c2 = cpi.next();
420 if(c1 != c2) {
421 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
422 name, (long)c1, (long)c2, cpi.getIndex());
423 return;
424 }
425 if(c1 < 0) { break; }
426 }
427
428 // Iterate backward to the start.
429 for(;;) {
430 UChar32 c1 = ci.previousCodePoint(errorCode);
431 UChar32 c2 = cpi.previous();
432 if(c1 != c2) {
433 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
434 name, (long)c1, (long)c2, cpi.getIndex());
435 return;
436 }
437 if(c1 < 0) { break; }
438 }
439 }
440
TestFCD()441 void CollationTest::TestFCD() {
442 IcuTestErrorCode errorCode(*this, "TestFCD");
443 const CollationData *data = CollationRoot::getData(errorCode);
444 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
445 return;
446 }
447
448 // Input string, not FCD, NUL-terminated.
449 static const UChar s[] = {
450 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
451 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
452 0x327, 0x308, // ccc=202, 230
453 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
454 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
455 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
456 0xac01,
457 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
458 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
459 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
460 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
461 0x4e00, 0xf81,
462 0
463 };
464 // Expected code points.
465 static const UChar32 cp[] = {
466 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
467 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
468 0x1D15F, 0x1D16D,
469 0xac01,
470 0x63, 0x327, 0x1D165, 0x1D16D,
471 0x61,
472 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
473 0x4e00, 0xf71, 0xf80
474 };
475
476 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
477 if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
478 return;
479 }
480 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
481 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
482
483 cpi.resetToStart();
484 std::string utf8;
485 UnicodeString(s).toUTF8String(utf8);
486 FCDUTF8CollationIterator u8ci(data, FALSE,
487 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
488 if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
489 return;
490 }
491 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
492
493 cpi.resetToStart();
494 UCharIterator iter;
495 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the terminating NUL
496 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
497 if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
498 return;
499 }
500 checkFCD("FCDUIterCollationIterator", uici, cpi);
501 }
502
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)503 void CollationTest::checkAllocWeights(CollationWeights &cw,
504 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
505 int32_t someLength, int32_t minCount) {
506 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
507 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
508 (long)lowerLimit, (long)upperLimit, (long)n);
509 return;
510 }
511 uint32_t previous = lowerLimit;
512 int32_t count = 0; // number of weights that have someLength
513 for(int32_t i = 0; i < n; ++i) {
514 uint32_t w = cw.nextWeight();
515 if(w == 0xffffffff) {
516 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
517 "returns only %ld weights",
518 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
519 return;
520 }
521 if(!(previous < w && w < upperLimit)) {
522 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
523 "number %ld -> %lx not between %lx and %lx",
524 (long)lowerLimit, (long)upperLimit, (long)n,
525 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
526 return;
527 }
528 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
529 }
530 if(count < minCount) {
531 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
532 "returns only %ld < %ld weights of length %d",
533 (long)lowerLimit, (long)upperLimit, (long)n,
534 (long)count, (long)minCount, (int)someLength);
535 }
536 }
537
TestCollationWeights()538 void CollationTest::TestCollationWeights() {
539 CollationWeights cw;
540
541 // Non-compressible primaries use 254 second bytes 02..FF.
542 logln("CollationWeights.initForPrimary(non-compressible)");
543 cw.initForPrimary(FALSE);
544 // Expect 1 weight 11 and 254 weights 12xx.
545 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
546 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
547 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
548 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
549 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
550 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
551 // Expect 254^2=64516 three-byte weights.
552 // During computation, there should be 3 three-byte ranges
553 // 10ffff, 11xxxx, 120202.
554 // The middle one should be split 64515:1,
555 // and the newly-split-off range and the last ranged lengthened.
556 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
557 // Expect weights 1102 & 1103.
558 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
559 // Expect weights 102102 & 102103.
560 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
561
562 // Compressible primaries use 251 second bytes 04..FE.
563 logln("CollationWeights.initForPrimary(compressible)");
564 cw.initForPrimary(TRUE);
565 // Expect 1 weight 11 and 251 weights 12xx.
566 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
567 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
568 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
569 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
570 // Expect weights 1104 & 1105.
571 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
572 // Expect weights 102102 & 102103.
573 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
574
575 // Secondary and tertiary weights use only bytes 3 & 4.
576 logln("CollationWeights.initForSecondary()");
577 cw.initForSecondary();
578 // Expect weights fbxx and all four fc..ff.
579 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
580
581 logln("CollationWeights.initForTertiary()");
582 cw.initForTertiary();
583 // Expect weights 3dxx and both 3e & 3f.
584 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
585 }
586
587 namespace {
588
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)589 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
590 uint32_t p, uint32_t s, uint32_t ctq) {
591 uint32_t p1 = p >> 24;
592 uint32_t p2 = (p >> 16) & 0xff;
593 uint32_t p3 = (p >> 8) & 0xff;
594 uint32_t p4 = p & 0xff;
595 uint32_t s1 = s >> 8;
596 uint32_t s2 = s & 0xff;
597 // ctq = Case, Tertiary, Quaternary
598 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
599 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
600 uint32_t t1 = t >> 8;
601 uint32_t t2 = t & 0xff;
602 uint32_t q = ctq & Collation::QUATERNARY_MASK;
603 // No leading zero bytes.
604 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
605 return FALSE;
606 }
607 // No intermediate zero bytes.
608 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
609 return FALSE;
610 }
611 if(p2 != 0 && p3 == 0 && p4 != 0) {
612 return FALSE;
613 }
614 // Minimum & maximum lead bytes.
615 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
616 s1 == Collation::LEVEL_SEPARATOR_BYTE ||
617 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
618 return FALSE;
619 }
620 if(c > 2) {
621 return FALSE;
622 }
623 // The valid byte range for the second primary byte depends on compressibility.
624 if(p2 != 0) {
625 if(data.isCompressibleLeadByte(p1)) {
626 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
627 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
628 return FALSE;
629 }
630 } else {
631 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
632 return FALSE;
633 }
634 }
635 }
636 // Other bytes just need to avoid the level separator.
637 // Trailing zeros are ok.
638 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
639 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
640 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
641 return FALSE;
642 }
643 // Well-formed CEs.
644 if(p == 0) {
645 if(s == 0) {
646 if(t == 0) {
647 // Completely ignorable CE.
648 // Quaternary CEs are not supported.
649 if(c != 0 || q != 0) {
650 return FALSE;
651 }
652 } else {
653 // Tertiary CE.
654 if(t < re.getTertiaryBoundary() || c != 2) {
655 return FALSE;
656 }
657 }
658 } else {
659 // Secondary CE.
660 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
661 return FALSE;
662 }
663 }
664 } else {
665 // Primary CE.
666 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
667 s >= re.getSecondaryBoundary()) {
668 return FALSE;
669 }
670 if(t == 0 || t >= re.getTertiaryBoundary()) {
671 return FALSE;
672 }
673 }
674 return TRUE;
675 }
676
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)677 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
678 uint32_t p = (uint32_t)(ce >> 32);
679 uint32_t secTer = (uint32_t)ce;
680 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
681 }
682
683 class RootElementsIterator {
684 public:
RootElementsIterator(const CollationData & root)685 RootElementsIterator(const CollationData &root)
686 : data(root),
687 elements(root.rootElements), length(root.rootElementsLength),
688 pri(0), secTer(0),
689 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
690
next()691 UBool next() {
692 if(index >= length) { return FALSE; }
693 uint32_t p = elements[index];
694 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
695 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
696 ++index;
697 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
698 return TRUE;
699 }
700 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
701 // End of a range, enumerate the primaries in the range.
702 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
703 p &= 0xffffff00;
704 if(pri == p) {
705 // Finished the range, return the next CE after it.
706 ++index;
707 return next();
708 }
709 U_ASSERT(pri < p);
710 // Return the next primary in this range.
711 UBool isCompressible = data.isCompressiblePrimary(pri);
712 if((pri & 0xffff) == 0) {
713 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
714 } else {
715 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
716 }
717 return TRUE;
718 }
719 // Simple primary CE.
720 ++index;
721 pri = p;
722 // Does this have an explicit below-common sec/ter unit,
723 // or does it imply a common one?
724 if(index == length) {
725 secTer = Collation::COMMON_SEC_AND_TER_CE;
726 } else {
727 secTer = elements[index];
728 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
729 // No sec/ter delta.
730 secTer = Collation::COMMON_SEC_AND_TER_CE;
731 } else {
732 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
733 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
734 // Implied sec/ter.
735 secTer = Collation::COMMON_SEC_AND_TER_CE;
736 } else {
737 // Explicit sec/ter below common/common.
738 ++index;
739 }
740 }
741 }
742 return TRUE;
743 }
744
getPrimary() const745 uint32_t getPrimary() const { return pri; }
getSecTer() const746 uint32_t getSecTer() const { return secTer; }
747
748 private:
749 const CollationData &data;
750 const uint32_t *elements;
751 int32_t length;
752
753 uint32_t pri;
754 uint32_t secTer;
755 int32_t index;
756 };
757
758 } // namespace
759
TestRootElements()760 void CollationTest::TestRootElements() {
761 IcuTestErrorCode errorCode(*this, "TestRootElements");
762 const CollationData *root = CollationRoot::getData(errorCode);
763 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
764 return;
765 }
766 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
767 RootElementsIterator iter(*root);
768
769 // We check each root CE for validity,
770 // and we also verify that there is a tailoring gap between each two CEs.
771 CollationWeights cw1c; // compressible primary weights
772 CollationWeights cw1u; // uncompressible primary weights
773 CollationWeights cw2;
774 CollationWeights cw3;
775
776 cw1c.initForPrimary(TRUE);
777 cw1u.initForPrimary(FALSE);
778 cw2.initForSecondary();
779 cw3.initForTertiary();
780
781 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
782 // nor the special merge-separator CE for U+FFFE.
783 uint32_t prevPri = 0;
784 uint32_t prevSec = 0;
785 uint32_t prevTer = 0;
786 while(iter.next()) {
787 uint32_t pri = iter.getPrimary();
788 uint32_t secTer = iter.getSecTer();
789 // CollationRootElements CEs must have 0 case and quaternary bits.
790 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
791 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
792 (long)pri, (long)secTer);
793 }
794 uint32_t sec = secTer >> 16;
795 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
796 uint32_t ctq = ter;
797 if(pri == 0 && sec == 0 && ter != 0) {
798 // Tertiary CEs must have uppercase bits,
799 // but they are not stored in the CollationRootElements.
800 ctq |= 0x8000;
801 }
802 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
803 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
804 } else {
805 if(pri != prevPri) {
806 uint32_t newWeight = 0;
807 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
808 // There is currently no tailoring gap after primary ignorables,
809 // and we forbid tailoring after U+FFFD and U+FFFF.
810 } else if(root->isCompressiblePrimary(prevPri)) {
811 if(!cw1c.allocWeights(prevPri, pri, 1)) {
812 errln("no primary/compressible tailoring gap between %08lx and %08lx",
813 (long)prevPri, (long)pri);
814 } else {
815 newWeight = cw1c.nextWeight();
816 }
817 } else {
818 if(!cw1u.allocWeights(prevPri, pri, 1)) {
819 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
820 (long)prevPri, (long)pri);
821 } else {
822 newWeight = cw1u.nextWeight();
823 }
824 }
825 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
826 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
827 (long)prevPri, (long)newWeight, (long)pri);
828 }
829 } else if(sec != prevSec) {
830 uint32_t lowerLimit =
831 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
832 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
833 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
834 } else {
835 uint32_t newWeight = cw2.nextWeight();
836 if(!(prevSec < newWeight && newWeight < sec)) {
837 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
838 (long)lowerLimit, (long)newWeight, (long)sec);
839 }
840 }
841 } else if(ter != prevTer) {
842 uint32_t lowerLimit =
843 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
844 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
845 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
846 } else {
847 uint32_t newWeight = cw3.nextWeight();
848 if(!(prevTer < newWeight && newWeight < ter)) {
849 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
850 (long)lowerLimit, (long)newWeight, (long)ter);
851 }
852 }
853 } else {
854 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
855 }
856 }
857 prevPri = pri;
858 prevSec = sec;
859 prevTer = ter;
860 }
861 }
862
TestTailoredElements()863 void CollationTest::TestTailoredElements() {
864 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
865 const CollationData *root = CollationRoot::getData(errorCode);
866 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
867 return;
868 }
869 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
870
871 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
872 if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
873 return;
874 }
875 uhash_setKeyDeleter(prevLocales, uprv_free);
876 // TestRootElements() tests the root collator which does not have tailorings.
877 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
878 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
879 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
880
881 UVector64 ces(errorCode);
882 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
883 U_ASSERT(locales.isValid());
884 const char *localeID = "root";
885 do {
886 Locale locale(localeID);
887 LocalPointer<StringEnumeration> types(
888 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
889 errorCode.assertSuccess();
890 const char *type; // first: default type
891 while((type = types->next(NULL, errorCode)) != NULL) {
892 if(strncmp(type, "private-", 8) == 0) {
893 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
894 localeID, type);
895 }
896 Locale localeWithType(locale);
897 localeWithType.setKeywordValue("collation", type, errorCode);
898 errorCode.assertSuccess();
899 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
900 if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
901 localeWithType.getName())) {
902 continue;
903 }
904 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
905 if(uhash_geti(prevLocales, actual.getName()) != 0) {
906 continue;
907 }
908 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
909 errorCode.assertSuccess();
910 logln("TestTailoredElements(): requested %s -> actual %s",
911 localeWithType.getName(), actual.getName());
912 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
913 if(rbc == NULL) {
914 continue;
915 }
916 // Note: It would be better to get tailored strings such that we can
917 // identify the prefix, and only get the CEs for the prefix+string,
918 // not also for the prefix.
919 // There is currently no API for that.
920 // It would help in an unusual case where a contraction starting in the prefix
921 // extends past its end, and we do not see the intended mapping.
922 // For example, for a mapping p|st, if there is also a contraction ps,
923 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
924 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
925 errorCode.assertSuccess();
926 UnicodeSetIterator iter(*tailored);
927 while(iter.next()) {
928 const UnicodeString &s = iter.getString();
929 ces.removeAllElements();
930 rbc->internalGetCEs(s, ces, errorCode);
931 errorCode.assertSuccess();
932 for(int32_t i = 0; i < ces.size(); ++i) {
933 int64_t ce = ces.elementAti(i);
934 if(!isValidCE(rootElements, *root, ce)) {
935 errln("invalid tailored CE %016llx at CE index %d from string:",
936 (long long)ce, (int)i);
937 infoln(prettify(s));
938 }
939 }
940 }
941 }
942 } while((localeID = locales->next(NULL, errorCode)) != NULL);
943 uhash_close(prevLocales);
944 }
945
printSortKey(const uint8_t * p,int32_t length)946 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
947 UnicodeString s;
948 for(int32_t i = 0; i < length; ++i) {
949 if(i > 0) { s.append((UChar)0x20); }
950 uint8_t b = p[i];
951 if(b == 0) {
952 s.append((UChar)0x2e); // period
953 } else if(b == 1) {
954 s.append((UChar)0x7c); // vertical bar
955 } else {
956 appendHex(b, 2, s);
957 }
958 }
959 return s;
960 }
961
printCollationKey(const CollationKey & key)962 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
963 int32_t length;
964 const uint8_t *p = key.getByteArray(length);
965 return printSortKey(p, length);
966 }
967
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)968 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
969 for(;;) {
970 int32_t lineLength;
971 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
972 if(line == NULL || errorCode.isFailure()) {
973 fileLine.remove();
974 return FALSE;
975 }
976 ++fileLineNumber;
977 // Strip trailing CR/LF, comments, and spaces.
978 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
979 if(comment != NULL) {
980 lineLength = (int32_t)(comment - line);
981 } else {
982 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
983 }
984 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
985 if(lineLength != 0) {
986 fileLine.setTo(FALSE, line, lineLength);
987 return TRUE;
988 }
989 // Empty line, continue.
990 }
991 }
992
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)993 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
994 UErrorCode &errorCode) {
995 int32_t length = fileLine.length();
996 int32_t i;
997 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
998 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
999 if(pipeIndex >= 0) {
1000 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1001 if(prefix.isEmpty()) {
1002 errln("empty prefix on line %d", (int)fileLineNumber);
1003 infoln(fileLine);
1004 errorCode = U_PARSE_ERROR;
1005 return;
1006 }
1007 start = pipeIndex + 1;
1008 } else {
1009 prefix.remove();
1010 }
1011 s = fileLine.tempSubStringBetween(start, i).unescape();
1012 if(s.isEmpty()) {
1013 errln("empty string on line %d", (int)fileLineNumber);
1014 infoln(fileLine);
1015 errorCode = U_PARSE_ERROR;
1016 return;
1017 }
1018 start = i;
1019 }
1020
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1021 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1022 Collation::Level relation;
1023 int32_t start;
1024 if(fileLine[0] == 0x3c) { // <
1025 UChar second = fileLine[1];
1026 start = 2;
1027 switch(second) {
1028 case 0x31: // <1
1029 relation = Collation::PRIMARY_LEVEL;
1030 break;
1031 case 0x32: // <2
1032 relation = Collation::SECONDARY_LEVEL;
1033 break;
1034 case 0x33: // <3
1035 relation = Collation::TERTIARY_LEVEL;
1036 break;
1037 case 0x34: // <4
1038 relation = Collation::QUATERNARY_LEVEL;
1039 break;
1040 case 0x63: // <c
1041 relation = Collation::CASE_LEVEL;
1042 break;
1043 case 0x69: // <i
1044 relation = Collation::IDENTICAL_LEVEL;
1045 break;
1046 default: // just <
1047 relation = Collation::NO_LEVEL;
1048 start = 1;
1049 break;
1050 }
1051 } else if(fileLine[0] == 0x3d) { // =
1052 relation = Collation::ZERO_LEVEL;
1053 start = 1;
1054 } else {
1055 start = 0;
1056 }
1057 if(start == 0 || !isSpace(fileLine[start])) {
1058 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1059 infoln(fileLine);
1060 errorCode.set(U_PARSE_ERROR);
1061 return Collation::NO_LEVEL;
1062 }
1063 start = skipSpaces(start);
1064 UnicodeString prefix;
1065 parseString(start, prefix, s, errorCode);
1066 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1067 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1068 infoln(fileLine);
1069 errorCode.set(U_PARSE_ERROR);
1070 return Collation::NO_LEVEL;
1071 }
1072 if(start < fileLine.length()) {
1073 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1074 infoln(fileLine);
1075 errorCode.set(U_PARSE_ERROR);
1076 return Collation::NO_LEVEL;
1077 }
1078 return relation;
1079 }
1080
// Maps the attribute names accepted in test data files to UColAttribute constants.
// parseAndSetAttribute() compares the text before the '=' against these names.
static const struct {
    const char *name;
    UColAttribute attr;
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1094
// Maps the attribute value names accepted in test data files to UColAttributeValue constants.
// parseAndSetAttribute() compares the text after the '=' against these names.
// The table is shared by all attributes; whether a value is legal for a given attribute
// is left to Collator::setAttribute().
static const struct {
    const char *name;
    UColAttributeValue value;
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1112
parseAndSetAttribute(IcuTestErrorCode & errorCode)1113 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1114 // Parse attributes even if the Collator could not be created,
1115 // in order to report syntax errors.
1116 int32_t start = skipSpaces(1);
1117 int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1118 if(equalPos < 0) {
1119 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1120 parseAndSetReorderCodes(start + 7, errorCode);
1121 return;
1122 }
1123 errln("missing '=' on line %d", (int)fileLineNumber);
1124 infoln(fileLine);
1125 errorCode.set(U_PARSE_ERROR);
1126 return;
1127 }
1128
1129 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1130 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1131 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1132 UColReorderCode max;
1133 if(valueString == UNICODE_STRING("space", 5)) {
1134 max = UCOL_REORDER_CODE_SPACE;
1135 } else if(valueString == UNICODE_STRING("punct", 5)) {
1136 max = UCOL_REORDER_CODE_PUNCTUATION;
1137 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1138 max = UCOL_REORDER_CODE_SYMBOL;
1139 } else if(valueString == UNICODE_STRING("currency", 8)) {
1140 max = UCOL_REORDER_CODE_CURRENCY;
1141 } else {
1142 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1143 infoln(fileLine);
1144 errorCode.set(U_PARSE_ERROR);
1145 return;
1146 }
1147 if(coll != NULL) {
1148 coll->setMaxVariable(max, errorCode);
1149 if(errorCode.isFailure()) {
1150 errln("setMaxVariable() failed on line %d: %s",
1151 (int)fileLineNumber, errorCode.errorName());
1152 infoln(fileLine);
1153 return;
1154 }
1155 }
1156 fileLine.remove();
1157 return;
1158 }
1159
1160 UColAttribute attr;
1161 for(int32_t i = 0;; ++i) {
1162 if(i == UPRV_LENGTHOF(attributes)) {
1163 errln("invalid attribute name on line %d", (int)fileLineNumber);
1164 infoln(fileLine);
1165 errorCode.set(U_PARSE_ERROR);
1166 return;
1167 }
1168 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1169 attr = attributes[i].attr;
1170 break;
1171 }
1172 }
1173
1174 UColAttributeValue value;
1175 for(int32_t i = 0;; ++i) {
1176 if(i == UPRV_LENGTHOF(attributeValues)) {
1177 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1178 infoln(fileLine);
1179 errorCode.set(U_PARSE_ERROR);
1180 return;
1181 }
1182 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1183 value = attributeValues[i].value;
1184 break;
1185 }
1186 }
1187
1188 if(coll != NULL) {
1189 coll->setAttribute(attr, value, errorCode);
1190 if(errorCode.isFailure()) {
1191 errln("illegal attribute=value combination on line %d: %s",
1192 (int)fileLineNumber, errorCode.errorName());
1193 infoln(fileLine);
1194 return;
1195 }
1196 }
1197 fileLine.remove();
1198 }
1199
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1200 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1201 UVector32 reorderCodes(errorCode);
1202 while(start < fileLine.length()) {
1203 start = skipSpaces(start);
1204 int32_t limit = start;
1205 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1206 CharString name;
1207 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1208 int32_t code = CollationRuleParser::getReorderCode(name.data());
1209 if(code < 0) {
1210 if(uprv_stricmp(name.data(), "default") == 0) {
1211 code = UCOL_REORDER_CODE_DEFAULT; // -1
1212 } else {
1213 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1214 infoln(fileLine);
1215 errorCode.set(U_PARSE_ERROR);
1216 return;
1217 }
1218 }
1219 reorderCodes.addElement(code, errorCode);
1220 start = limit;
1221 }
1222 if(coll != NULL) {
1223 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1224 if(errorCode.isFailure()) {
1225 errln("setReorderCodes() failed on line %d: %s",
1226 (int)fileLineNumber, errorCode.errorName());
1227 infoln(fileLine);
1228 return;
1229 }
1230 }
1231 fileLine.remove();
1232 }
1233
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1234 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1235 UnicodeString rules;
1236 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1237 rules.append(fileLine.unescape());
1238 }
1239 if(errorCode.isFailure()) { return; }
1240 logln(rules);
1241
1242 UParseError parseError;
1243 UnicodeString reason;
1244 delete coll;
1245 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1246 if(coll == NULL) {
1247 errln("unable to allocate a new collator");
1248 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1249 return;
1250 }
1251 if(errorCode.isFailure()) {
1252 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1253 infoln(UnicodeString(" reason: ") + reason);
1254 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1255 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1256 infoln(UnicodeString(" snippet: ...") +
1257 parseError.preContext + "(!)" + parseError.postContext + "...");
1258 }
1259 delete coll;
1260 coll = NULL;
1261 errorCode.reset();
1262 } else {
1263 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1264 UnicodeString(), reason);
1265 }
1266 }
1267
setRootCollator(IcuTestErrorCode & errorCode)1268 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1269 if(errorCode.isFailure()) { return; }
1270 delete coll;
1271 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1272 if(errorCode.isFailure()) {
1273 dataerrln("unable to create a root collator");
1274 return;
1275 }
1276 }
1277
setLocaleCollator(IcuTestErrorCode & errorCode)1278 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1279 if(errorCode.isFailure()) { return; }
1280 delete coll;
1281 coll = NULL;
1282 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1283 if(at >= 0) {
1284 fileLine.setCharAt(at, (UChar)0x2a); // *
1285 }
1286 CharString localeID;
1287 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1288 if(at >= 0) {
1289 localeID.data()[at - 9] = '@';
1290 }
1291 Locale locale(localeID.data());
1292 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1293 errln("invalid language tag on line %d", (int)fileLineNumber);
1294 infoln(fileLine);
1295 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1296 return;
1297 }
1298
1299 logln("creating a collator for locale ID %s", locale.getName());
1300 coll = Collator::createInstance(locale, errorCode);
1301 if(errorCode.isFailure()) {
1302 dataerrln("unable to create a collator for locale %s on line %d",
1303 locale.getName(), (int)fileLineNumber);
1304 infoln(fileLine);
1305 delete coll;
1306 coll = NULL;
1307 errorCode.reset();
1308 }
1309 }
1310
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1311 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1312 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1313 // In some sequences with Tibetan composite vowel signs,
1314 // even if the string passes the FCD check,
1315 // those composites must be decomposed.
1316 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1317 int32_t index = 0;
1318 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1319 if(++index < s.length()) {
1320 UChar c = s[index];
1321 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1322 }
1323 }
1324 return FALSE;
1325 }
1326
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1327 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1328 CharString &dest, int32_t partSize,
1329 IcuTestErrorCode &errorCode) {
1330 if(errorCode.isFailure()) { return FALSE; }
1331 uint8_t part[32];
1332 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1333 UCharIterator iter;
1334 uiter_setString(&iter, s, length);
1335 uint32_t state[2] = { 0, 0 };
1336 for(;;) {
1337 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1338 UBool done = partLength < partSize;
1339 if(done) {
1340 // At the end, append the next byte as well which should be 00.
1341 ++partLength;
1342 }
1343 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1344 if(done) {
1345 return errorCode.isSuccess();
1346 }
1347 }
1348 }
1349
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1350 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1351 const UChar *s, int32_t length,
1352 CollationKey &key, IcuTestErrorCode &errorCode) {
1353 if(errorCode.isFailure()) { return FALSE; }
1354 coll->getCollationKey(s, length, key, errorCode);
1355 if(errorCode.isFailure()) {
1356 infoln(fileTestName);
1357 errln("Collator(%s).getCollationKey() failed: %s",
1358 norm, errorCode.errorName());
1359 infoln(line);
1360 return FALSE;
1361 }
1362 int32_t keyLength;
1363 const uint8_t *keyBytes = key.getByteArray(keyLength);
1364 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1365 infoln(fileTestName);
1366 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1367 norm);
1368 infoln(line);
1369 infoln(printCollationKey(key));
1370 return FALSE;
1371 }
1372
1373 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1374 if(numLevels < UCOL_IDENTICAL) {
1375 ++numLevels;
1376 } else {
1377 numLevels = 5;
1378 }
1379 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1380 ++numLevels;
1381 }
1382 errorCode.assertSuccess();
1383 int32_t numLevelSeparators = 0;
1384 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1385 uint8_t b = keyBytes[i];
1386 if(b == 0) {
1387 infoln(fileTestName);
1388 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1389 infoln(line);
1390 infoln(printCollationKey(key));
1391 return FALSE;
1392 }
1393 if(b == 1) { ++numLevelSeparators; }
1394 }
1395 if(numLevelSeparators != (numLevels - 1)) {
1396 infoln(fileTestName);
1397 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1398 norm, (int)numLevelSeparators, (int)numLevels);
1399 infoln(line);
1400 infoln(printCollationKey(key));
1401 return FALSE;
1402 }
1403
1404 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1405 static const int32_t partSizes[] = { 32, 3, 1 };
1406 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1407 int32_t partSize = partSizes[psi];
1408 CharString parts;
1409 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1410 infoln(fileTestName);
1411 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1412 norm, (int)partSize, errorCode.errorName());
1413 infoln(line);
1414 return FALSE;
1415 }
1416 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1417 infoln(fileTestName);
1418 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1419 norm, (int)partSize);
1420 infoln(line);
1421 infoln(printCollationKey(key));
1422 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1423 return FALSE;
1424 }
1425 }
1426 return TRUE;
1427 }
1428
1429 /**
1430 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1431 * Leaves key unchanged if s does not contain U+FFFE.
1432 * @return TRUE if the key was successfully changed
1433 */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1434 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1435 CollationKey &key, IcuTestErrorCode &errorCode) {
1436 if(errorCode.isFailure()) { return FALSE; }
1437 LocalMemory<uint8_t> mergedKey;
1438 int32_t mergedKeyLength = 0;
1439 int32_t mergedKeyCapacity = 0;
1440 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1441 int32_t segmentStart = 0;
1442 for(int32_t i = 0;;) {
1443 if(i == sLength) {
1444 if(segmentStart == 0) {
1445 // s does not contain any U+FFFE.
1446 return FALSE;
1447 }
1448 } else if(s[i] != 0xfffe) {
1449 ++i;
1450 continue;
1451 }
1452 // Get the sort key for another segment and merge it into mergedKey.
1453 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1454 CollationKey key2;
1455 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1456 int32_t key1Length, key2Length;
1457 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1458 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1459 uint8_t *dest;
1460 int32_t minCapacity = key1Length + key2Length;
1461 if(key1Length > 0) { --minCapacity; }
1462 if(minCapacity <= mergedKeyCapacity) {
1463 dest = mergedKey.getAlias();
1464 } else {
1465 if(minCapacity <= 200) {
1466 mergedKeyCapacity = 200;
1467 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1468 mergedKeyCapacity *= 2;
1469 } else {
1470 mergedKeyCapacity = minCapacity;
1471 }
1472 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1473 }
1474 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1475 if(key1Length == 0) {
1476 // key2 is the sort key for the first segment.
1477 uprv_memcpy(dest, key2Bytes, key2Length);
1478 mergedKeyLength = key2Length;
1479 } else {
1480 mergedKeyLength =
1481 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1482 dest, mergedKeyCapacity);
1483 }
1484 if(i == sLength) { break; }
1485 segmentStart = ++i;
1486 }
1487 key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1488 return TRUE;
1489 }
1490
1491 namespace {
1492
1493 /**
1494 * Replaces unpaired surrogates with U+FFFD.
1495 * Returns s if no replacement was made, otherwise buffer.
1496 */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1497 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1498 int32_t i = 0;
1499 while(i < s.length()) {
1500 UChar32 c = s.char32At(i);
1501 if(U_IS_SURROGATE(c)) {
1502 if(buffer.length() < i) {
1503 buffer.append(s, buffer.length(), i - buffer.length());
1504 }
1505 buffer.append((UChar)0xfffd);
1506 }
1507 i += U16_LENGTH(c);
1508 }
1509 if(buffer.isEmpty()) {
1510 return s;
1511 }
1512 if(buffer.length() < i) {
1513 buffer.append(s, buffer.length(), i - buffer.length());
1514 }
1515 return buffer;
1516 }
1517
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1518 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1519 UCollationResult order, UBool collHasCaseLevel) {
1520 if(order == UCOL_EQUAL) {
1521 return Collation::NO_LEVEL;
1522 }
1523 int32_t prevKeyLength;
1524 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1525 int32_t keyLength;
1526 const uint8_t *bytes = key.getByteArray(keyLength);
1527 int32_t level = Collation::PRIMARY_LEVEL;
1528 for(int32_t i = 0;; ++i) {
1529 uint8_t b = prevBytes[i];
1530 if(b != bytes[i]) { break; }
1531 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1532 ++level;
1533 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1534 ++level;
1535 }
1536 }
1537 }
1538 return level;
1539 }
1540
1541 }
1542
/**
 * Compares prevString with s through several Collator entry points and reports
 * the first result that disagrees with the expectation.
 *
 * Checked in this order, each in both argument orders unless noted:
 * - compare(UnicodeString, UnicodeString)
 * - compare(UChar *, -1, ...) with NUL-terminated input, if neither string contains NUL
 * - compareUTF8(), with unpaired surrogates replaced by U+FFFD first
 * - internalCompareUTF8(char *, -1, ...), if neither string contains NUL
 * - compare(UCharIterator, UCharIterator) (previous, current only)
 * - CollationKey::compareTo() on the two sort keys, plus the level of the first difference
 * - if either string contains U+FFFE: compareTo() and difference level
 *   on the merged per-segment sort keys
 *
 * @param norm text for the "Collator(%s)" part of error messages
 * @param prevFileLine the test file line for prevString, printed with error messages
 * @param prevString the previous test string
 * @param s the current test string; its file line is the fileLine member
 * @param expectedOrder the expected result of comparing prevString with s
 * @param expectedLevel the level on which the sort keys are expected to differ;
 *                      Collation::NO_LEVEL disables the level checks
 * @param errorCode in/out error code
 * @return TRUE if all checks passed
 */
UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
                                     const UnicodeString &prevString, const UnicodeString &s,
                                     UCollationResult expectedOrder, Collation::Level expectedLevel,
                                     IcuTestErrorCode &errorCode) {
    if(errorCode.isFailure()) { return FALSE; }

    // Get the sort keys first, for error debug output.
    CollationKey prevKey;
    if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
                        prevKey, errorCode)) {
        return FALSE;
    }
    CollationKey key;
    if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }

    // compare(UnicodeString), in both directions.
    UCollationResult order = coll->compare(prevString, s, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compare(s, prevString, errorCode);
    if(order != -expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
    if(!containNUL) {
        order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
        if(order != -expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // compare(UTF-16) treats unpaired surrogates like unassigned code points.
    // Unpaired surrogates cannot be converted to UTF-8.
    // Create valid UTF-16 strings if necessary, and use those for
    // both the expected compare() result and for the input to compare(UTF-8).
    UnicodeString prevBuffer, sBuffer;
    const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
    const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
    std::string prevUTF8, sUTF8;
    UnicodeString(prevValid).toUTF8String(prevUTF8);
    UnicodeString(sValid).toUTF8String(sUTF8);
    UCollationResult expectedUTF8Order;
    // surrogatesToFFFD() returns its input itself when nothing was replaced,
    // so identical addresses mean that both strings were already valid.
    if(&prevValid == &prevString && &sValid == &s) {
        expectedUTF8Order = expectedOrder;
    } else {
        expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
    }

    order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
    if(order != expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
    if(order != -expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    if(!containNUL) {
        order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
        if(order != expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
        if(order != -expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // compare(UCharIterator), previous vs. current only.
    UCharIterator leftIter;
    UCharIterator rightIter;
    uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
    uiter_setString(&rightIter, s.getBuffer(), s.length());
    order = coll->compare(leftIter, rightIter, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
              "wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }

    // The sort keys must compare the same way as the strings.
    order = prevKey.compareTo(key, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // The keys must first differ on the expected level, if one was specified.
    UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
    int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
    if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
        if(level != expectedLevel) {
            infoln(fileTestName);
            errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
                  (int)fileLineNumber, norm, order, level, expectedLevel);
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // If either string contains U+FFFE, then their sort keys must compare the same as
    // the merged sort keys of each string's between-FFFE segments.
    //
    // It is not required that
    // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
    // only that those two methods yield the same order.
    //
    // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
    // Note: prevKey and key are overwritten with the merged keys from here on.
    if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
                getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
            errorCode.isFailure()) {
        order = prevKey.compareTo(key, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                  "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        // The merged keys must differ on the same level as the regular keys.
        int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
        if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
            if(mergedLevel != level) {
                infoln(fileTestName);
                errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                      "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
                      (int)fileLineNumber, norm, order, mergedLevel, level);
                infoln(prevFileLine);
                infoln(fileLine);
                infoln(printCollationKey(prevKey));
                infoln(printCollationKey(key));
                return FALSE;
            }
        }
    }
    return TRUE;
}
1755
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1756 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1757 if(errorCode.isFailure()) { return; }
1758 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1759 UnicodeString prevString, s;
1760 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1761 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1762 // Parse the line even if it will be ignored (when we do not have a Collator)
1763 // in order to report syntax issues.
1764 Collation::Level relation = parseRelationAndString(s, errorCode);
1765 if(errorCode.isFailure()) {
1766 errorCode.reset();
1767 break;
1768 }
1769 if(coll == NULL) {
1770 // We were unable to create the Collator but continue with tests.
1771 // Ignore test data for this Collator.
1772 // The next Collator creation might work.
1773 continue;
1774 }
1775 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1776 Collation::Level expectedLevel = relation;
1777 s.getTerminatedBuffer(); // Ensure NUL-termination.
1778 UBool isOk = TRUE;
1779 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1780 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1781 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1782 expectedOrder, expectedLevel, errorCode);
1783 }
1784 if(isOk) {
1785 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1786 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1787 expectedOrder, expectedLevel, errorCode);
1788 }
1789 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1790 UnicodeString pn = nfd->normalize(prevString, errorCode);
1791 UnicodeString n = nfd->normalize(s, errorCode);
1792 pn.getTerminatedBuffer();
1793 n.getTerminatedBuffer();
1794 errorCode.assertSuccess();
1795 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1796 expectedOrder, expectedLevel, errorCode);
1797 }
1798 if(!isOk) {
1799 errorCode.reset(); // already reported
1800 }
1801 prevFileLine = fileLine;
1802 prevString = s;
1803 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1804 }
1805 }
1806
TestDataDriven()1807 void CollationTest::TestDataDriven() {
1808 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1809
1810 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1811 nfd = Normalizer2::getNFDInstance(errorCode);
1812 if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1813 return;
1814 }
1815
1816 CharString path(getSourceTestData(errorCode), errorCode);
1817 path.appendPathPart("collationtest.txt", errorCode);
1818 const char *codePage = "UTF-8";
1819 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1820 if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1821 return;
1822 }
1823 // Read a new line if necessary.
1824 // Sub-parsers leave the first line set that they do not handle.
1825 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1826 if(!isSectionStarter(fileLine[0])) {
1827 errln("syntax error on line %d", (int)fileLineNumber);
1828 infoln(fileLine);
1829 return;
1830 }
1831 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1832 fileTestName = fileLine;
1833 logln(fileLine);
1834 fileLine.remove();
1835 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1836 setRootCollator(errorCode);
1837 fileLine.remove();
1838 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1839 setLocaleCollator(errorCode);
1840 fileLine.remove();
1841 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1842 buildTailoring(f.getAlias(), errorCode);
1843 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1844 parseAndSetAttribute(errorCode);
1845 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1846 checkCompareStrings(f.getAlias(), errorCode);
1847 } else {
1848 errln("syntax error on line %d", (int)fileLineNumber);
1849 infoln(fileLine);
1850 return;
1851 }
1852 }
1853 }
1854
1855 #endif // !UCONFIG_NO_COLLATION
1856