1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/stringpiece.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uniset.h"
29 #include "unicode/unistr.h"
30 #include "unicode/usetiter.h"
31 #include "unicode/ustring.h"
32 #include "charstr.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationfcd.h"
37 #include "collationiterator.h"
38 #include "collationroot.h"
39 #include "collationrootelements.h"
40 #include "collationruleparser.h"
41 #include "collationweights.h"
42 #include "cstring.h"
43 #include "intltest.h"
44 #include "normalizer2impl.h"
45 #include "ucbuf.h"
46 #include "uhash.h"
47 #include "uitercollationiterator.h"
48 #include "utf16collationiterator.h"
49 #include "utf8collationiterator.h"
50 #include "uvectr32.h"
51 #include "uvectr64.h"
52 #include "writesrc.h"
53
54 class CodePointIterator;
55
56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57
58 class CollationTest : public IntlTest {
59 public:
CollationTest()60 CollationTest()
61 : fcd(NULL), nfd(NULL),
62 fileLineNumber(0),
63 coll(NULL) {}
64
~CollationTest()65 ~CollationTest() {
66 delete coll;
67 }
68
69 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
70
71 void TestMinMax();
72 void TestImplicits();
73 void TestNulTerminated();
74 void TestIllegalUTF8();
75 void TestShortFCDData();
76 void TestFCD();
77 void TestCollationWeights();
78 void TestRootElements();
79 void TestTailoredElements();
80 void TestDataDriven();
81 void TestLongLocale();
82
83 private:
84 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
85 void checkAllocWeights(CollationWeights &cw,
86 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
87 int32_t someLength, int32_t minCount);
88
89 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
90 static UnicodeString printCollationKey(const CollationKey &key);
91
92 // Helpers & fields for data-driven test.
isCROrLF(UChar c)93 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)94 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)95 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
skipSpaces(int32_t i)96 int32_t skipSpaces(int32_t i) {
97 while(isSpace(fileLine[i])) { ++i; }
98 return i;
99 }
100
101 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
102 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
103 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
104 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
105 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
106 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
107 void setRootCollator(IcuTestErrorCode &errorCode);
108 void setLocaleCollator(IcuTestErrorCode &errorCode);
109
110 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
111
112 UBool getSortKeyParts(const UChar *s, int32_t length,
113 CharString &dest, int32_t partSize,
114 IcuTestErrorCode &errorCode);
115 UBool getCollationKey(const char *norm, const UnicodeString &line,
116 const UChar *s, int32_t length,
117 CollationKey &key, IcuTestErrorCode &errorCode);
118 UBool getMergedCollationKey(const UChar *s, int32_t length,
119 CollationKey &key, IcuTestErrorCode &errorCode);
120 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
121 const UnicodeString &prevString, const UnicodeString &s,
122 UCollationResult expectedOrder, Collation::Level expectedLevel,
123 IcuTestErrorCode &errorCode);
124 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
125
126 const Normalizer2 *fcd, *nfd;
127 UnicodeString fileLine;
128 int32_t fileLineNumber;
129 UnicodeString fileTestName;
130 Collator *coll;
131 };
132
createCollationTest()133 extern IntlTest *createCollationTest() {
134 return new CollationTest();
135 }
136
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)137 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
138 if(exec) {
139 logln("TestSuite CollationTest: ");
140 }
141 TESTCASE_AUTO_BEGIN;
142 TESTCASE_AUTO(TestMinMax);
143 TESTCASE_AUTO(TestImplicits);
144 TESTCASE_AUTO(TestNulTerminated);
145 TESTCASE_AUTO(TestIllegalUTF8);
146 TESTCASE_AUTO(TestShortFCDData);
147 TESTCASE_AUTO(TestFCD);
148 TESTCASE_AUTO(TestCollationWeights);
149 TESTCASE_AUTO(TestRootElements);
150 TESTCASE_AUTO(TestTailoredElements);
151 TESTCASE_AUTO(TestDataDriven);
152 TESTCASE_AUTO(TestLongLocale);
153 TESTCASE_AUTO_END;
154 }
155
TestMinMax()156 void CollationTest::TestMinMax() {
157 IcuTestErrorCode errorCode(*this, "TestMinMax");
158
159 setRootCollator(errorCode);
160 if(errorCode.isFailure()) {
161 errorCode.reset();
162 return;
163 }
164 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
165 if(rbc == NULL) {
166 errln("the root collator is not a RuleBasedCollator");
167 return;
168 }
169
170 static const UChar s[2] = { 0xfffe, 0xffff };
171 UVector64 ces(errorCode);
172 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
173 errorCode.assertSuccess();
174 if(ces.size() != 2) {
175 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
176 return;
177 }
178 int64_t ce = ces.elementAti(0);
179 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
180 if(ce != expected) {
181 errln("CE(U+fffe)=%04lx != 02..", (long)ce);
182 }
183
184 ce = ces.elementAti(1);
185 expected = Collation::makeCE(Collation::MAX_PRIMARY);
186 if(ce != expected) {
187 errln("CE(U+ffff)=%04lx != max..", (long)ce);
188 }
189 }
190
TestImplicits()191 void CollationTest::TestImplicits() {
192 IcuTestErrorCode errorCode(*this, "TestImplicits");
193
194 const CollationData *cd = CollationRoot::getData(errorCode);
195 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
196 return;
197 }
198
199 // Implicit primary weights should be assigned for the following sets,
200 // and sort in ascending order by set and then code point.
201 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
202
203 // core Han Unified Ideographs
204 UnicodeSet coreHan("[\\p{unified_ideograph}&"
205 "[\\p{Block=CJK_Unified_Ideographs}"
206 "\\p{Block=CJK_Compatibility_Ideographs}]]",
207 errorCode);
208 // all other Unified Han ideographs
209 UnicodeSet otherHan("[\\p{unified ideograph}-"
210 "[\\p{Block=CJK_Unified_Ideographs}"
211 "\\p{Block=CJK_Compatibility_Ideographs}]]",
212 errorCode);
213 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
214 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
215
216 // Starting with CLDR 26/ICU 54, the root Han order may instead be
217 // the Unihan radical-stroke order.
218 // The tests should pass either way, so we only test the order of a small set of Han characters
219 // whose radical-stroke order is the same as their code point order.
220 UnicodeSet someHanInCPOrder(
221 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
222 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
223 errorCode);
224 UnicodeSet inOrder(someHanInCPOrder);
225 inOrder.addAll(unassigned).freeze();
226 if(errorCode.errIfFailureAndReset("UnicodeSet")) {
227 return;
228 }
229 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
230 UChar32 prev = 0;
231 uint32_t prevPrimary = 0;
232 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
233 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
234 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
235 while(iter->next()) {
236 UChar32 c = iter->getCodepoint();
237 UnicodeString s(c);
238 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
239 int64_t ce = ci.nextCE(errorCode);
240 int64_t ce2 = ci.nextCE(errorCode);
241 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
242 return;
243 }
244 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
245 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
246 continue;
247 }
248 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
249 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
250 (long)c, (long)(ce & 0xffffffff));
251 continue;
252 }
253 uint32_t primary = (uint32_t)(ce >> 32);
254 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
255 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
256 (long)c, (long)primary, (long)prev, (long)prevPrimary);
257 }
258 prev = c;
259 prevPrimary = primary;
260 }
261 }
262 }
263
TestNulTerminated()264 void CollationTest::TestNulTerminated() {
265 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
266 const CollationData *data = CollationRoot::getData(errorCode);
267 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
268 return;
269 }
270
271 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
272
273 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
274 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
275 for(int32_t i = 0;; ++i) {
276 int64_t ce1 = ci1.nextCE(errorCode);
277 int64_t ce2 = ci2.nextCE(errorCode);
278 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
279 return;
280 }
281 if(ce1 != ce2) {
282 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
283 break;
284 }
285 if(ce1 == Collation::NO_CE) { break; }
286 }
287 }
288
TestIllegalUTF8()289 void CollationTest::TestIllegalUTF8() {
290 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
291
292 setRootCollator(errorCode);
293 if(errorCode.isFailure()) {
294 errorCode.reset();
295 return;
296 }
297 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
298
299 static const StringPiece strings[] = {
300 // string with U+FFFD == illegal byte sequence
301 u8"a\uFFFDz", "a\x80z", // trail byte
302 u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
303 u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
304 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
305 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
306 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
307 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
308 };
309
310 for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
311 StringPiece fffd(strings[i]);
312 StringPiece illegal(strings[i + 1]);
313 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
314 if(order != UCOL_EQUAL) {
315 errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
316 (int)i, order);
317 }
318 }
319 }
320
321 namespace {
322
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)323 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
324 for(UChar32 c = 0x10000; c < 0x110000;) {
325 UChar32 next = c + 0x400;
326 if(src.containsSome(c, next - 1)) {
327 dest.add(U16_LEAD(c));
328 }
329 c = next;
330 }
331 }
332
333 } // namespace
334
TestShortFCDData()335 void CollationTest::TestShortFCDData() {
336 // See CollationFCD class comments.
337 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
338 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
339 errorCode.assertSuccess();
340 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
341 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
342 UnicodeSet lccc; // actual
343 for(UChar32 c = 0; c <= 0xffff; ++c) {
344 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
345 }
346 UnicodeSet diff(expectedLccc);
347 diff.removeAll(lccc);
348 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
349 UnicodeString empty("[]");
350 UnicodeString diffString;
351 diff.toPattern(diffString, TRUE);
352 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
353 diff = lccc;
354 diff.removeAll(expectedLccc);
355 diff.toPattern(diffString, TRUE);
356 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
357
358 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
359 if (errorCode.isSuccess()) {
360 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
361 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
362 UnicodeSet tccc; // actual
363 for(UChar32 c = 0; c <= 0xffff; ++c) {
364 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
365 }
366 diff = expectedTccc;
367 diff.removeAll(tccc);
368 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
369 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
370 diff = tccc;
371 diff.removeAll(expectedTccc);
372 diff.toPattern(diffString, TRUE);
373 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
374 }
375 }
376
377 class CodePointIterator {
378 public:
CodePointIterator(const UChar32 * cp,int32_t length)379 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()380 void resetToStart() { pos = 0; }
next()381 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()382 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const383 int32_t getLength() const { return length; }
getIndex() const384 int getIndex() const { return (int)pos; }
385 private:
386 const UChar32 *cp;
387 int32_t length;
388 int32_t pos;
389 };
390
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)391 void CollationTest::checkFCD(const char *name,
392 CollationIterator &ci, CodePointIterator &cpi) {
393 IcuTestErrorCode errorCode(*this, "checkFCD");
394
395 // Iterate forward to the limit.
396 for(;;) {
397 UChar32 c1 = ci.nextCodePoint(errorCode);
398 UChar32 c2 = cpi.next();
399 if(c1 != c2) {
400 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
401 name, (long)c1, (long)c2, cpi.getIndex());
402 return;
403 }
404 if(c1 < 0) { break; }
405 }
406
407 // Iterate backward most of the way.
408 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
409 UChar32 c1 = ci.previousCodePoint(errorCode);
410 UChar32 c2 = cpi.previous();
411 if(c1 != c2) {
412 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
413 name, (long)c1, (long)c2, cpi.getIndex());
414 return;
415 }
416 }
417
418 // Forward again.
419 for(;;) {
420 UChar32 c1 = ci.nextCodePoint(errorCode);
421 UChar32 c2 = cpi.next();
422 if(c1 != c2) {
423 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
424 name, (long)c1, (long)c2, cpi.getIndex());
425 return;
426 }
427 if(c1 < 0) { break; }
428 }
429
430 // Iterate backward to the start.
431 for(;;) {
432 UChar32 c1 = ci.previousCodePoint(errorCode);
433 UChar32 c2 = cpi.previous();
434 if(c1 != c2) {
435 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
436 name, (long)c1, (long)c2, cpi.getIndex());
437 return;
438 }
439 if(c1 < 0) { break; }
440 }
441 }
442
TestFCD()443 void CollationTest::TestFCD() {
444 IcuTestErrorCode errorCode(*this, "TestFCD");
445 const CollationData *data = CollationRoot::getData(errorCode);
446 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
447 return;
448 }
449
450 // Input string, not FCD, NUL-terminated.
451 static const UChar s[] = {
452 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
453 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
454 0x327, 0x308, // ccc=202, 230
455 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
456 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
457 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
458 0xac01,
459 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
460 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
461 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
462 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
463 0x4e00, 0xf81,
464 0
465 };
466 // Expected code points.
467 static const UChar32 cp[] = {
468 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
469 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
470 0x1D15F, 0x1D16D,
471 0xac01,
472 0x63, 0x327, 0x1D165, 0x1D16D,
473 0x61,
474 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
475 0x4e00, 0xf71, 0xf80
476 };
477
478 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
479 if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
480 return;
481 }
482 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
483 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
484
485 cpi.resetToStart();
486 std::string utf8;
487 UnicodeString(s).toUTF8String(utf8);
488 FCDUTF8CollationIterator u8ci(data, FALSE,
489 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
490 if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
491 return;
492 }
493 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
494
495 cpi.resetToStart();
496 UCharIterator iter;
497 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the terminating NUL
498 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
499 if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
500 return;
501 }
502 checkFCD("FCDUIterCollationIterator", uici, cpi);
503 }
504
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)505 void CollationTest::checkAllocWeights(CollationWeights &cw,
506 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
507 int32_t someLength, int32_t minCount) {
508 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
509 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
510 (long)lowerLimit, (long)upperLimit, (long)n);
511 return;
512 }
513 uint32_t previous = lowerLimit;
514 int32_t count = 0; // number of weights that have someLength
515 for(int32_t i = 0; i < n; ++i) {
516 uint32_t w = cw.nextWeight();
517 if(w == 0xffffffff) {
518 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
519 "returns only %ld weights",
520 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
521 return;
522 }
523 if(!(previous < w && w < upperLimit)) {
524 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
525 "number %ld -> %lx not between %lx and %lx",
526 (long)lowerLimit, (long)upperLimit, (long)n,
527 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
528 return;
529 }
530 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
531 }
532 if(count < minCount) {
533 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
534 "returns only %ld < %ld weights of length %d",
535 (long)lowerLimit, (long)upperLimit, (long)n,
536 (long)count, (long)minCount, (int)someLength);
537 }
538 }
539
TestCollationWeights()540 void CollationTest::TestCollationWeights() {
541 CollationWeights cw;
542
543 // Non-compressible primaries use 254 second bytes 02..FF.
544 logln("CollationWeights.initForPrimary(non-compressible)");
545 cw.initForPrimary(FALSE);
546 // Expect 1 weight 11 and 254 weights 12xx.
547 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
548 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
549 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
550 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
551 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
552 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
553 // Expect 254^2=64516 three-byte weights.
554 // During computation, there should be 3 three-byte ranges
555 // 10ffff, 11xxxx, 120202.
556 // The middle one should be split 64515:1,
557 // and the newly-split-off range and the last ranged lengthened.
558 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
559 // Expect weights 1102 & 1103.
560 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
561 // Expect weights 102102 & 102103.
562 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
563
564 // Compressible primaries use 251 second bytes 04..FE.
565 logln("CollationWeights.initForPrimary(compressible)");
566 cw.initForPrimary(TRUE);
567 // Expect 1 weight 11 and 251 weights 12xx.
568 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
569 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
570 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
571 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
572 // Expect weights 1104 & 1105.
573 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
574 // Expect weights 102102 & 102103.
575 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
576
577 // Secondary and tertiary weights use only bytes 3 & 4.
578 logln("CollationWeights.initForSecondary()");
579 cw.initForSecondary();
580 // Expect weights fbxx and all four fc..ff.
581 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
582
583 logln("CollationWeights.initForTertiary()");
584 cw.initForTertiary();
585 // Expect weights 3dxx and both 3e & 3f.
586 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
587 }
588
589 namespace {
590
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)591 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
592 uint32_t p, uint32_t s, uint32_t ctq) {
593 uint32_t p1 = p >> 24;
594 uint32_t p2 = (p >> 16) & 0xff;
595 uint32_t p3 = (p >> 8) & 0xff;
596 uint32_t p4 = p & 0xff;
597 uint32_t s1 = s >> 8;
598 uint32_t s2 = s & 0xff;
599 // ctq = Case, Tertiary, Quaternary
600 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
601 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
602 uint32_t t1 = t >> 8;
603 uint32_t t2 = t & 0xff;
604 uint32_t q = ctq & Collation::QUATERNARY_MASK;
605 // No leading zero bytes.
606 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
607 return FALSE;
608 }
609 // No intermediate zero bytes.
610 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
611 return FALSE;
612 }
613 if(p2 != 0 && p3 == 0 && p4 != 0) {
614 return FALSE;
615 }
616 // Minimum & maximum lead bytes.
617 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
618 s1 == Collation::LEVEL_SEPARATOR_BYTE ||
619 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
620 return FALSE;
621 }
622 if(c > 2) {
623 return FALSE;
624 }
625 // The valid byte range for the second primary byte depends on compressibility.
626 if(p2 != 0) {
627 if(data.isCompressibleLeadByte(p1)) {
628 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
629 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
630 return FALSE;
631 }
632 } else {
633 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
634 return FALSE;
635 }
636 }
637 }
638 // Other bytes just need to avoid the level separator.
639 // Trailing zeros are ok.
640 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
641 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
642 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
643 return FALSE;
644 }
645 // Well-formed CEs.
646 if(p == 0) {
647 if(s == 0) {
648 if(t == 0) {
649 // Completely ignorable CE.
650 // Quaternary CEs are not supported.
651 if(c != 0 || q != 0) {
652 return FALSE;
653 }
654 } else {
655 // Tertiary CE.
656 if(t < re.getTertiaryBoundary() || c != 2) {
657 return FALSE;
658 }
659 }
660 } else {
661 // Secondary CE.
662 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
663 return FALSE;
664 }
665 }
666 } else {
667 // Primary CE.
668 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
669 s >= re.getSecondaryBoundary()) {
670 return FALSE;
671 }
672 if(t == 0 || t >= re.getTertiaryBoundary()) {
673 return FALSE;
674 }
675 }
676 return TRUE;
677 }
678
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)679 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
680 uint32_t p = (uint32_t)(ce >> 32);
681 uint32_t secTer = (uint32_t)ce;
682 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
683 }
684
685 class RootElementsIterator {
686 public:
RootElementsIterator(const CollationData & root)687 RootElementsIterator(const CollationData &root)
688 : data(root),
689 elements(root.rootElements), length(root.rootElementsLength),
690 pri(0), secTer(0),
691 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
692
next()693 UBool next() {
694 if(index >= length) { return FALSE; }
695 uint32_t p = elements[index];
696 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
697 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
698 ++index;
699 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
700 return TRUE;
701 }
702 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
703 // End of a range, enumerate the primaries in the range.
704 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
705 p &= 0xffffff00;
706 if(pri == p) {
707 // Finished the range, return the next CE after it.
708 ++index;
709 return next();
710 }
711 U_ASSERT(pri < p);
712 // Return the next primary in this range.
713 UBool isCompressible = data.isCompressiblePrimary(pri);
714 if((pri & 0xffff) == 0) {
715 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
716 } else {
717 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
718 }
719 return TRUE;
720 }
721 // Simple primary CE.
722 ++index;
723 pri = p;
724 // Does this have an explicit below-common sec/ter unit,
725 // or does it imply a common one?
726 if(index == length) {
727 secTer = Collation::COMMON_SEC_AND_TER_CE;
728 } else {
729 secTer = elements[index];
730 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
731 // No sec/ter delta.
732 secTer = Collation::COMMON_SEC_AND_TER_CE;
733 } else {
734 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
735 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
736 // Implied sec/ter.
737 secTer = Collation::COMMON_SEC_AND_TER_CE;
738 } else {
739 // Explicit sec/ter below common/common.
740 ++index;
741 }
742 }
743 }
744 return TRUE;
745 }
746
getPrimary() const747 uint32_t getPrimary() const { return pri; }
getSecTer() const748 uint32_t getSecTer() const { return secTer; }
749
750 private:
751 const CollationData &data;
752 const uint32_t *elements;
753 int32_t length;
754
755 uint32_t pri;
756 uint32_t secTer;
757 int32_t index;
758 };
759
760 } // namespace
761
TestRootElements()762 void CollationTest::TestRootElements() {
763 IcuTestErrorCode errorCode(*this, "TestRootElements");
764 const CollationData *root = CollationRoot::getData(errorCode);
765 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
766 return;
767 }
768 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
769 RootElementsIterator iter(*root);
770
771 // We check each root CE for validity,
772 // and we also verify that there is a tailoring gap between each two CEs.
773 CollationWeights cw1c; // compressible primary weights
774 CollationWeights cw1u; // uncompressible primary weights
775 CollationWeights cw2;
776 CollationWeights cw3;
777
778 cw1c.initForPrimary(TRUE);
779 cw1u.initForPrimary(FALSE);
780 cw2.initForSecondary();
781 cw3.initForTertiary();
782
783 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
784 // nor the special merge-separator CE for U+FFFE.
785 uint32_t prevPri = 0;
786 uint32_t prevSec = 0;
787 uint32_t prevTer = 0;
788 while(iter.next()) {
789 uint32_t pri = iter.getPrimary();
790 uint32_t secTer = iter.getSecTer();
791 // CollationRootElements CEs must have 0 case and quaternary bits.
792 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
793 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
794 (long)pri, (long)secTer);
795 }
796 uint32_t sec = secTer >> 16;
797 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
798 uint32_t ctq = ter;
799 if(pri == 0 && sec == 0 && ter != 0) {
800 // Tertiary CEs must have uppercase bits,
801 // but they are not stored in the CollationRootElements.
802 ctq |= 0x8000;
803 }
804 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
805 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
806 } else {
807 if(pri != prevPri) {
808 uint32_t newWeight = 0;
809 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
810 // There is currently no tailoring gap after primary ignorables,
811 // and we forbid tailoring after U+FFFD and U+FFFF.
812 } else if(root->isCompressiblePrimary(prevPri)) {
813 if(!cw1c.allocWeights(prevPri, pri, 1)) {
814 errln("no primary/compressible tailoring gap between %08lx and %08lx",
815 (long)prevPri, (long)pri);
816 } else {
817 newWeight = cw1c.nextWeight();
818 }
819 } else {
820 if(!cw1u.allocWeights(prevPri, pri, 1)) {
821 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
822 (long)prevPri, (long)pri);
823 } else {
824 newWeight = cw1u.nextWeight();
825 }
826 }
827 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
828 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
829 (long)prevPri, (long)newWeight, (long)pri);
830 }
831 } else if(sec != prevSec) {
832 uint32_t lowerLimit =
833 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
834 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
835 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
836 } else {
837 uint32_t newWeight = cw2.nextWeight();
838 if(!(prevSec < newWeight && newWeight < sec)) {
839 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
840 (long)lowerLimit, (long)newWeight, (long)sec);
841 }
842 }
843 } else if(ter != prevTer) {
844 uint32_t lowerLimit =
845 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
846 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
847 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
848 } else {
849 uint32_t newWeight = cw3.nextWeight();
850 if(!(prevTer < newWeight && newWeight < ter)) {
851 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
852 (long)lowerLimit, (long)newWeight, (long)ter);
853 }
854 }
855 } else {
856 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
857 }
858 }
859 prevPri = pri;
860 prevSec = sec;
861 prevTer = ter;
862 }
863 }
864
TestTailoredElements()865 void CollationTest::TestTailoredElements() {
866 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
867 const CollationData *root = CollationRoot::getData(errorCode);
868 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
869 return;
870 }
871 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
872
873 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
874 if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
875 return;
876 }
877 uhash_setKeyDeleter(prevLocales, uprv_free);
878 // TestRootElements() tests the root collator which does not have tailorings.
879 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
880 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
881 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
882
883 UVector64 ces(errorCode);
884 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
885 U_ASSERT(locales.isValid());
886 const char *localeID = "root";
887 do {
888 Locale locale(localeID);
889 LocalPointer<StringEnumeration> types(
890 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
891 errorCode.assertSuccess();
892 const char *type; // first: default type
893 while((type = types->next(NULL, errorCode)) != NULL) {
894 if(strncmp(type, "private-", 8) == 0) {
895 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
896 localeID, type);
897 }
898 Locale localeWithType(locale);
899 localeWithType.setKeywordValue("collation", type, errorCode);
900 errorCode.assertSuccess();
901 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
902 if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
903 localeWithType.getName())) {
904 continue;
905 }
906 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
907 if(uhash_geti(prevLocales, actual.getName()) != 0) {
908 continue;
909 }
910 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
911 errorCode.assertSuccess();
912 logln("TestTailoredElements(): requested %s -> actual %s",
913 localeWithType.getName(), actual.getName());
914 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
915 if(rbc == NULL) {
916 continue;
917 }
918 // Note: It would be better to get tailored strings such that we can
919 // identify the prefix, and only get the CEs for the prefix+string,
920 // not also for the prefix.
921 // There is currently no API for that.
922 // It would help in an unusual case where a contraction starting in the prefix
923 // extends past its end, and we do not see the intended mapping.
924 // For example, for a mapping p|st, if there is also a contraction ps,
925 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
926 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
927 errorCode.assertSuccess();
928 UnicodeSetIterator iter(*tailored);
929 while(iter.next()) {
930 const UnicodeString &s = iter.getString();
931 ces.removeAllElements();
932 rbc->internalGetCEs(s, ces, errorCode);
933 errorCode.assertSuccess();
934 for(int32_t i = 0; i < ces.size(); ++i) {
935 int64_t ce = ces.elementAti(i);
936 if(!isValidCE(rootElements, *root, ce)) {
937 errln("invalid tailored CE %016llx at CE index %d from string:",
938 (long long)ce, (int)i);
939 infoln(prettify(s));
940 }
941 }
942 }
943 }
944 } while((localeID = locales->next(NULL, errorCode)) != NULL);
945 uhash_close(prevLocales);
946 }
947
printSortKey(const uint8_t * p,int32_t length)948 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
949 UnicodeString s;
950 for(int32_t i = 0; i < length; ++i) {
951 if(i > 0) { s.append((UChar)0x20); }
952 uint8_t b = p[i];
953 if(b == 0) {
954 s.append((UChar)0x2e); // period
955 } else if(b == 1) {
956 s.append((UChar)0x7c); // vertical bar
957 } else {
958 appendHex(b, 2, s);
959 }
960 }
961 return s;
962 }
963
printCollationKey(const CollationKey & key)964 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
965 int32_t length;
966 const uint8_t *p = key.getByteArray(length);
967 return printSortKey(p, length);
968 }
969
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)970 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
971 for(;;) {
972 int32_t lineLength;
973 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
974 if(line == NULL || errorCode.isFailure()) {
975 fileLine.remove();
976 return FALSE;
977 }
978 ++fileLineNumber;
979 // Strip trailing CR/LF, comments, and spaces.
980 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
981 if(comment != NULL) {
982 lineLength = (int32_t)(comment - line);
983 } else {
984 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
985 }
986 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
987 if(lineLength != 0) {
988 fileLine.setTo(FALSE, line, lineLength);
989 return TRUE;
990 }
991 // Empty line, continue.
992 }
993 }
994
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)995 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
996 UErrorCode &errorCode) {
997 int32_t length = fileLine.length();
998 int32_t i;
999 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1000 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
1001 if(pipeIndex >= 0) {
1002 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1003 if(prefix.isEmpty()) {
1004 errln("empty prefix on line %d", (int)fileLineNumber);
1005 infoln(fileLine);
1006 errorCode = U_PARSE_ERROR;
1007 return;
1008 }
1009 start = pipeIndex + 1;
1010 } else {
1011 prefix.remove();
1012 }
1013 s = fileLine.tempSubStringBetween(start, i).unescape();
1014 if(s.isEmpty()) {
1015 errln("empty string on line %d", (int)fileLineNumber);
1016 infoln(fileLine);
1017 errorCode = U_PARSE_ERROR;
1018 return;
1019 }
1020 start = i;
1021 }
1022
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1023 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1024 Collation::Level relation;
1025 int32_t start;
1026 if(fileLine[0] == 0x3c) { // <
1027 UChar second = fileLine[1];
1028 start = 2;
1029 switch(second) {
1030 case 0x31: // <1
1031 relation = Collation::PRIMARY_LEVEL;
1032 break;
1033 case 0x32: // <2
1034 relation = Collation::SECONDARY_LEVEL;
1035 break;
1036 case 0x33: // <3
1037 relation = Collation::TERTIARY_LEVEL;
1038 break;
1039 case 0x34: // <4
1040 relation = Collation::QUATERNARY_LEVEL;
1041 break;
1042 case 0x63: // <c
1043 relation = Collation::CASE_LEVEL;
1044 break;
1045 case 0x69: // <i
1046 relation = Collation::IDENTICAL_LEVEL;
1047 break;
1048 default: // just <
1049 relation = Collation::NO_LEVEL;
1050 start = 1;
1051 break;
1052 }
1053 } else if(fileLine[0] == 0x3d) { // =
1054 relation = Collation::ZERO_LEVEL;
1055 start = 1;
1056 } else {
1057 start = 0;
1058 }
1059 if(start == 0 || !isSpace(fileLine[start])) {
1060 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1061 infoln(fileLine);
1062 errorCode.set(U_PARSE_ERROR);
1063 return Collation::NO_LEVEL;
1064 }
1065 start = skipSpaces(start);
1066 UnicodeString prefix;
1067 parseString(start, prefix, s, errorCode);
1068 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1069 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1070 infoln(fileLine);
1071 errorCode.set(U_PARSE_ERROR);
1072 return Collation::NO_LEVEL;
1073 }
1074 if(start < fileLine.length()) {
1075 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1076 infoln(fileLine);
1077 errorCode.set(U_PARSE_ERROR);
1078 return Collation::NO_LEVEL;
1079 }
1080 return relation;
1081 }
1082
// Attribute names that parseAndSetAttribute() accepts on the left side of '=',
// each paired with the UColAttribute that it selects.
// The lookup there is a linear search with an exact string comparison.
1083 static const struct {
1084 const char *name;
1085 UColAttribute attr;
1086 } attributes[] = {
1087 { "backwards", UCOL_FRENCH_COLLATION },
1088 { "alternate", UCOL_ALTERNATE_HANDLING },
1089 { "caseFirst", UCOL_CASE_FIRST },
1090 { "caseLevel", UCOL_CASE_LEVEL },
1091 // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1092 { "strength", UCOL_STRENGTH },
1093 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1094 { "numeric", UCOL_NUMERIC_COLLATION }
1095 };
1096
// Value names that parseAndSetAttribute() accepts on the right side of '=',
// each paired with the UColAttributeValue that it stands for.
// One table serves all attributes: whether a value is legal for a particular
// attribute is left to Collator::setAttribute(), whose failure is reported
// there as an illegal attribute=value combination.
1097 static const struct {
1098 const char *name;
1099 UColAttributeValue value;
1100 } attributeValues[] = {
1101 { "default", UCOL_DEFAULT },
1102 { "primary", UCOL_PRIMARY },
1103 { "secondary", UCOL_SECONDARY },
1104 { "tertiary", UCOL_TERTIARY },
1105 { "quaternary", UCOL_QUATERNARY },
1106 { "identical", UCOL_IDENTICAL },
1107 { "off", UCOL_OFF },
1108 { "on", UCOL_ON },
1109 { "shifted", UCOL_SHIFTED },
1110 { "non-ignorable", UCOL_NON_IGNORABLE },
1111 { "lower", UCOL_LOWER_FIRST },
1112 { "upper", UCOL_UPPER_FIRST }
1113 };
1114
parseAndSetAttribute(IcuTestErrorCode & errorCode)1115 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1116 // Parse attributes even if the Collator could not be created,
1117 // in order to report syntax errors.
1118 int32_t start = skipSpaces(1);
1119 int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1120 if(equalPos < 0) {
1121 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1122 parseAndSetReorderCodes(start + 7, errorCode);
1123 return;
1124 }
1125 errln("missing '=' on line %d", (int)fileLineNumber);
1126 infoln(fileLine);
1127 errorCode.set(U_PARSE_ERROR);
1128 return;
1129 }
1130
1131 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1132 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1133 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1134 UColReorderCode max;
1135 if(valueString == UNICODE_STRING("space", 5)) {
1136 max = UCOL_REORDER_CODE_SPACE;
1137 } else if(valueString == UNICODE_STRING("punct", 5)) {
1138 max = UCOL_REORDER_CODE_PUNCTUATION;
1139 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1140 max = UCOL_REORDER_CODE_SYMBOL;
1141 } else if(valueString == UNICODE_STRING("currency", 8)) {
1142 max = UCOL_REORDER_CODE_CURRENCY;
1143 } else {
1144 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1145 infoln(fileLine);
1146 errorCode.set(U_PARSE_ERROR);
1147 return;
1148 }
1149 if(coll != NULL) {
1150 coll->setMaxVariable(max, errorCode);
1151 if(errorCode.isFailure()) {
1152 errln("setMaxVariable() failed on line %d: %s",
1153 (int)fileLineNumber, errorCode.errorName());
1154 infoln(fileLine);
1155 return;
1156 }
1157 }
1158 fileLine.remove();
1159 return;
1160 }
1161
1162 UColAttribute attr;
1163 for(int32_t i = 0;; ++i) {
1164 if(i == UPRV_LENGTHOF(attributes)) {
1165 errln("invalid attribute name on line %d", (int)fileLineNumber);
1166 infoln(fileLine);
1167 errorCode.set(U_PARSE_ERROR);
1168 return;
1169 }
1170 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1171 attr = attributes[i].attr;
1172 break;
1173 }
1174 }
1175
1176 UColAttributeValue value;
1177 for(int32_t i = 0;; ++i) {
1178 if(i == UPRV_LENGTHOF(attributeValues)) {
1179 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1180 infoln(fileLine);
1181 errorCode.set(U_PARSE_ERROR);
1182 return;
1183 }
1184 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1185 value = attributeValues[i].value;
1186 break;
1187 }
1188 }
1189
1190 if(coll != NULL) {
1191 coll->setAttribute(attr, value, errorCode);
1192 if(errorCode.isFailure()) {
1193 errln("illegal attribute=value combination on line %d: %s",
1194 (int)fileLineNumber, errorCode.errorName());
1195 infoln(fileLine);
1196 return;
1197 }
1198 }
1199 fileLine.remove();
1200 }
1201
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1202 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1203 UVector32 reorderCodes(errorCode);
1204 while(start < fileLine.length()) {
1205 start = skipSpaces(start);
1206 int32_t limit = start;
1207 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1208 CharString name;
1209 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1210 int32_t code = CollationRuleParser::getReorderCode(name.data());
1211 if(code < 0) {
1212 if(uprv_stricmp(name.data(), "default") == 0) {
1213 code = UCOL_REORDER_CODE_DEFAULT; // -1
1214 } else {
1215 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1216 infoln(fileLine);
1217 errorCode.set(U_PARSE_ERROR);
1218 return;
1219 }
1220 }
1221 reorderCodes.addElement(code, errorCode);
1222 start = limit;
1223 }
1224 if(coll != NULL) {
1225 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1226 if(errorCode.isFailure()) {
1227 errln("setReorderCodes() failed on line %d: %s",
1228 (int)fileLineNumber, errorCode.errorName());
1229 infoln(fileLine);
1230 return;
1231 }
1232 }
1233 fileLine.remove();
1234 }
1235
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1236 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1237 UnicodeString rules;
1238 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1239 rules.append(fileLine.unescape());
1240 }
1241 if(errorCode.isFailure()) { return; }
1242 logln(rules);
1243
1244 UParseError parseError;
1245 UnicodeString reason;
1246 delete coll;
1247 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1248 if(coll == NULL) {
1249 errln("unable to allocate a new collator");
1250 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1251 return;
1252 }
1253 if(errorCode.isFailure()) {
1254 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1255 infoln(UnicodeString(" reason: ") + reason);
1256 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1257 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1258 infoln(UnicodeString(" snippet: ...") +
1259 parseError.preContext + "(!)" + parseError.postContext + "...");
1260 }
1261 delete coll;
1262 coll = NULL;
1263 errorCode.reset();
1264 } else {
1265 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1266 UnicodeString(), reason);
1267 }
1268 }
1269
setRootCollator(IcuTestErrorCode & errorCode)1270 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1271 if(errorCode.isFailure()) { return; }
1272 delete coll;
1273 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1274 if(errorCode.isFailure()) {
1275 dataerrln("unable to create a root collator");
1276 return;
1277 }
1278 }
1279
setLocaleCollator(IcuTestErrorCode & errorCode)1280 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1281 if(errorCode.isFailure()) { return; }
1282 delete coll;
1283 coll = NULL;
1284 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1285 if(at >= 0) {
1286 fileLine.setCharAt(at, (UChar)0x2a); // *
1287 }
1288 CharString localeID;
1289 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1290 if(at >= 0) {
1291 localeID.data()[at - 9] = '@';
1292 }
1293 Locale locale(localeID.data());
1294 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1295 errln("invalid language tag on line %d", (int)fileLineNumber);
1296 infoln(fileLine);
1297 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1298 return;
1299 }
1300
1301 logln("creating a collator for locale ID %s", locale.getName());
1302 coll = Collator::createInstance(locale, errorCode);
1303 if(errorCode.isFailure()) {
1304 dataerrln("unable to create a collator for locale %s on line %d",
1305 locale.getName(), (int)fileLineNumber);
1306 infoln(fileLine);
1307 delete coll;
1308 coll = NULL;
1309 errorCode.reset();
1310 }
1311 }
1312
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1313 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1314 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1315 // In some sequences with Tibetan composite vowel signs,
1316 // even if the string passes the FCD check,
1317 // those composites must be decomposed.
1318 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1319 int32_t index = 0;
1320 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1321 if(++index < s.length()) {
1322 UChar c = s[index];
1323 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1324 }
1325 }
1326 return FALSE;
1327 }
1328
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1329 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1330 CharString &dest, int32_t partSize,
1331 IcuTestErrorCode &errorCode) {
1332 if(errorCode.isFailure()) { return FALSE; }
1333 uint8_t part[32];
1334 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1335 UCharIterator iter;
1336 uiter_setString(&iter, s, length);
1337 uint32_t state[2] = { 0, 0 };
1338 for(;;) {
1339 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1340 UBool done = partLength < partSize;
1341 if(done) {
1342 // At the end, append the next byte as well which should be 00.
1343 ++partLength;
1344 }
1345 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1346 if(done) {
1347 return errorCode.isSuccess();
1348 }
1349 }
1350 }
1351
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1352 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1353 const UChar *s, int32_t length,
1354 CollationKey &key, IcuTestErrorCode &errorCode) {
1355 if(errorCode.isFailure()) { return FALSE; }
1356 coll->getCollationKey(s, length, key, errorCode);
1357 if(errorCode.isFailure()) {
1358 infoln(fileTestName);
1359 errln("Collator(%s).getCollationKey() failed: %s",
1360 norm, errorCode.errorName());
1361 infoln(line);
1362 return FALSE;
1363 }
1364 int32_t keyLength;
1365 const uint8_t *keyBytes = key.getByteArray(keyLength);
1366 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1367 infoln(fileTestName);
1368 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1369 norm);
1370 infoln(line);
1371 infoln(printCollationKey(key));
1372 return FALSE;
1373 }
1374
1375 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1376 if(numLevels < UCOL_IDENTICAL) {
1377 ++numLevels;
1378 } else {
1379 numLevels = 5;
1380 }
1381 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1382 ++numLevels;
1383 }
1384 errorCode.assertSuccess();
1385 int32_t numLevelSeparators = 0;
1386 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1387 uint8_t b = keyBytes[i];
1388 if(b == 0) {
1389 infoln(fileTestName);
1390 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1391 infoln(line);
1392 infoln(printCollationKey(key));
1393 return FALSE;
1394 }
1395 if(b == 1) { ++numLevelSeparators; }
1396 }
1397 if(numLevelSeparators != (numLevels - 1)) {
1398 infoln(fileTestName);
1399 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1400 norm, (int)numLevelSeparators, (int)numLevels);
1401 infoln(line);
1402 infoln(printCollationKey(key));
1403 return FALSE;
1404 }
1405
1406 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1407 static const int32_t partSizes[] = { 32, 3, 1 };
1408 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1409 int32_t partSize = partSizes[psi];
1410 CharString parts;
1411 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1412 infoln(fileTestName);
1413 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1414 norm, (int)partSize, errorCode.errorName());
1415 infoln(line);
1416 return FALSE;
1417 }
1418 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1419 infoln(fileTestName);
1420 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1421 norm, (int)partSize);
1422 infoln(line);
1423 infoln(printCollationKey(key));
1424 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1425 return FALSE;
1426 }
1427 }
1428 return TRUE;
1429 }
1430
1431 /**
1432 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1433 * Leaves key unchanged if s does not contain U+FFFE.
1434 * @return TRUE if the key was successfully changed
1435 */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1436 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1437 CollationKey &key, IcuTestErrorCode &errorCode) {
1438 if(errorCode.isFailure()) { return FALSE; }
1439 LocalMemory<uint8_t> mergedKey;
1440 int32_t mergedKeyLength = 0;
1441 int32_t mergedKeyCapacity = 0;
1442 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1443 int32_t segmentStart = 0;
1444 for(int32_t i = 0;;) {
1445 if(i == sLength) {
1446 if(segmentStart == 0) {
1447 // s does not contain any U+FFFE.
1448 return FALSE;
1449 }
1450 } else if(s[i] != 0xfffe) {
1451 ++i;
1452 continue;
1453 }
1454 // Get the sort key for another segment and merge it into mergedKey.
1455 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1456 CollationKey key2;
1457 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1458 int32_t key1Length, key2Length;
1459 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1460 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1461 uint8_t *dest;
1462 int32_t minCapacity = key1Length + key2Length;
1463 if(key1Length > 0) { --minCapacity; }
1464 if(minCapacity <= mergedKeyCapacity) {
1465 dest = mergedKey.getAlias();
1466 } else {
1467 if(minCapacity <= 200) {
1468 mergedKeyCapacity = 200;
1469 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1470 mergedKeyCapacity *= 2;
1471 } else {
1472 mergedKeyCapacity = minCapacity;
1473 }
1474 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1475 }
1476 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1477 if(key1Length == 0) {
1478 // key2 is the sort key for the first segment.
1479 uprv_memcpy(dest, key2Bytes, key2Length);
1480 mergedKeyLength = key2Length;
1481 } else {
1482 mergedKeyLength =
1483 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1484 dest, mergedKeyCapacity);
1485 }
1486 if(i == sLength) { break; }
1487 segmentStart = ++i;
1488 }
1489 key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1490 return TRUE;
1491 }
1492
1493 namespace {
1494
1495 /**
1496 * Replaces unpaired surrogates with U+FFFD.
1497 * Returns s if no replacement was made, otherwise buffer.
1498 */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1499 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1500 int32_t i = 0;
1501 while(i < s.length()) {
1502 UChar32 c = s.char32At(i);
1503 if(U_IS_SURROGATE(c)) {
1504 if(buffer.length() < i) {
1505 buffer.append(s, buffer.length(), i - buffer.length());
1506 }
1507 buffer.append((UChar)0xfffd);
1508 }
1509 i += U16_LENGTH(c);
1510 }
1511 if(buffer.isEmpty()) {
1512 return s;
1513 }
1514 if(buffer.length() < i) {
1515 buffer.append(s, buffer.length(), i - buffer.length());
1516 }
1517 return buffer;
1518 }
1519
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1520 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1521 UCollationResult order, UBool collHasCaseLevel) {
1522 if(order == UCOL_EQUAL) {
1523 return Collation::NO_LEVEL;
1524 }
1525 int32_t prevKeyLength;
1526 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1527 int32_t keyLength;
1528 const uint8_t *bytes = key.getByteArray(keyLength);
1529 int32_t level = Collation::PRIMARY_LEVEL;
1530 for(int32_t i = 0;; ++i) {
1531 uint8_t b = prevBytes[i];
1532 if(b != bytes[i]) { break; }
1533 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1534 ++level;
1535 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1536 ++level;
1537 }
1538 }
1539 }
1540 return level;
1541 }
1542
1543 }
1544
/**
 * Checks that coll orders prevString and s as expected, through every comparison
 * API exercised here, in both argument orders where applicable:
 * compare(UnicodeString), compare(NUL-terminated UTF-16), compareUTF8(),
 * internalCompareUTF8(NUL-terminated), compare(UCharIterator),
 * CollationKey::compareTo(), and compareTo() on keys merged from U+FFFE-separated segments.
 * Also checks the level of the first sort key difference against expectedLevel.
 *
 * On the first mismatch, reports the test name, the error, both file lines
 * and both sort keys, and returns FALSE.
 *
 * @param norm          label for the collator, inserted into messages as "Collator(%s)"
 * @param prevFileLine  file line of the previous string, for error output
 * @param prevString    the previous (left) string
 * @param s             the current (right) string; its file line is the fileLine member
 * @param expectedOrder expected result of comparing prevString with s
 * @param expectedLevel expected level of the first difference;
 *                      Collation::NO_LEVEL turns the level checks off
 * @return TRUE if all checks passed; FALSE otherwise, or if errorCode indicates a failure on entry
 */
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1545 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1546 const UnicodeString &prevString, const UnicodeString &s,
1547 UCollationResult expectedOrder, Collation::Level expectedLevel,
1548 IcuTestErrorCode &errorCode) {
1549 if(errorCode.isFailure()) { return FALSE; }
1550
1551 // Get the sort keys first, for error debug output.
1552 CollationKey prevKey;
1553 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1554 prevKey, errorCode)) {
1555 return FALSE;
1556 }
1557 CollationKey key;
1558 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1559
// compare(UnicodeString, UnicodeString), then with swapped arguments,
// which must yield the opposite order.
1560 UCollationResult order = coll->compare(prevString, s, errorCode);
1561 if(order != expectedOrder || errorCode.isFailure()) {
1562 infoln(fileTestName);
1563 errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1564 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1565 infoln(prevFileLine);
1566 infoln(fileLine);
1567 infoln(printCollationKey(prevKey));
1568 infoln(printCollationKey(key));
1569 return FALSE;
1570 }
1571 order = coll->compare(s, prevString, errorCode);
1572 if(order != -expectedOrder || errorCode.isFailure()) {
1573 infoln(fileTestName);
1574 errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1575 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1576 infoln(prevFileLine);
1577 infoln(fileLine);
1578 infoln(printCollationKey(prevKey));
1579 infoln(printCollationKey(key));
1580 return FALSE;
1581 }
1582 // Test NUL-termination if the strings do not contain NUL characters.
// NOTE(review): the length -1 calls read up to a NUL, so they rely on both string buffers
// being NUL-terminated - presumably ensured by the caller; confirm.
1583 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1584 if(!containNUL) {
1585 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1586 if(order != expectedOrder || errorCode.isFailure()) {
1587 infoln(fileTestName);
1588 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1589 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1590 infoln(prevFileLine);
1591 infoln(fileLine);
1592 infoln(printCollationKey(prevKey));
1593 infoln(printCollationKey(key));
1594 return FALSE;
1595 }
1596 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1597 if(order != -expectedOrder || errorCode.isFailure()) {
1598 infoln(fileTestName);
1599 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1600 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1601 infoln(prevFileLine);
1602 infoln(fileLine);
1603 infoln(printCollationKey(prevKey));
1604 infoln(printCollationKey(key));
1605 return FALSE;
1606 }
1607 }
1608
1609 // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1610 // Unpaired surrogates cannot be converted to UTF-8.
1611 // Create valid UTF-16 strings if necessary, and use those for
1612 // both the expected compare() result and for the input to compare(UTF-8).
1613 UnicodeString prevBuffer, sBuffer;
1614 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1615 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1616 std::string prevUTF8, sUTF8;
1617 UnicodeString(prevValid).toUTF8String(prevUTF8);
1618 UnicodeString(sValid).toUTF8String(sUTF8);
1619 UCollationResult expectedUTF8Order;
// surrogatesToFFFD() returns its first argument itself when nothing was replaced,
// so comparing addresses detects whether either string was modified.
1620 if(&prevValid == &prevString && &sValid == &s) {
1621 expectedUTF8Order = expectedOrder;
1622 } else {
1623 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1624 }
1625
1626 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1627 if(order != expectedUTF8Order || errorCode.isFailure()) {
1628 infoln(fileTestName);
1629 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1630 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1631 infoln(prevFileLine);
1632 infoln(fileLine);
1633 infoln(printCollationKey(prevKey));
1634 infoln(printCollationKey(key));
1635 return FALSE;
1636 }
1637 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1638 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1639 infoln(fileTestName);
1640 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1641 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1642 infoln(prevFileLine);
1643 infoln(fileLine);
1644 infoln(printCollationKey(prevKey));
1645 infoln(printCollationKey(key));
1646 return FALSE;
1647 }
1648 // Test NUL-termination if the strings do not contain NUL characters.
1649 if(!containNUL) {
1650 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1651 if(order != expectedUTF8Order || errorCode.isFailure()) {
1652 infoln(fileTestName);
1653 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1654 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1655 infoln(prevFileLine);
1656 infoln(fileLine);
1657 infoln(printCollationKey(prevKey));
1658 infoln(printCollationKey(key));
1659 return FALSE;
1660 }
1661 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1662 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1663 infoln(fileTestName);
1664 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1665 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1666 infoln(prevFileLine);
1667 infoln(fileLine);
1668 infoln(printCollationKey(prevKey));
1669 infoln(printCollationKey(key));
1670 return FALSE;
1671 }
1672 }
1673
// compare(UCharIterator, UCharIterator) over the original UTF-16 strings.
// Only the (previous, current) argument order is tested here.
1674 UCharIterator leftIter;
1675 UCharIterator rightIter;
1676 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1677 uiter_setString(&rightIter, s.getBuffer(), s.length());
1678 order = coll->compare(leftIter, rightIter, errorCode);
1679 if(order != expectedOrder || errorCode.isFailure()) {
1680 infoln(fileTestName);
1681 errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1682 "wrong order: %d != %d (%s)",
1683 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1684 infoln(prevFileLine);
1685 infoln(fileLine);
1686 infoln(printCollationKey(prevKey));
1687 infoln(printCollationKey(key));
1688 return FALSE;
1689 }
1690
// The sort keys must compare in the expected order as well.
1691 order = prevKey.compareTo(key, errorCode);
1692 if(order != expectedOrder || errorCode.isFailure()) {
1693 infoln(fileTestName);
1694 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1695 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1696 infoln(prevFileLine);
1697 infoln(fileLine);
1698 infoln(printCollationKey(prevKey));
1699 infoln(printCollationKey(key));
1700 return FALSE;
1701 }
// The level of the first sort key difference must match the expected level,
// unless the keys are equal or no particular level is expected.
1702 UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1703 int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1704 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1705 if(level != expectedLevel) {
1706 infoln(fileTestName);
1707 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1708 (int)fileLineNumber, norm, order, level, expectedLevel);
1709 infoln(prevFileLine);
1710 infoln(fileLine);
1711 infoln(printCollationKey(prevKey));
1712 infoln(printCollationKey(key));
1713 return FALSE;
1714 }
1715 }
1716
1717 // If either string contains U+FFFE, then their sort keys must compare the same as
1718 // the merged sort keys of each string's between-FFFE segments.
1719 //
1720 // It is not required that
1721 // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1722 // only that those two methods yield the same order.
1723 //
1724 // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
// From here on, prevKey and/or key may hold merged keys instead of the regular sort keys.
1725 if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1726 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1727 errorCode.isFailure()) {
1728 order = prevKey.compareTo(key, errorCode);
1729 if(order != expectedOrder || errorCode.isFailure()) {
1730 infoln(fileTestName);
1731 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1732 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1733 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1734 infoln(prevFileLine);
1735 infoln(fileLine);
1736 infoln(printCollationKey(prevKey));
1737 infoln(printCollationKey(key));
1738 return FALSE;
1739 }
// The merged keys are checked against the level found for the regular keys (level),
// not against expectedLevel directly.
1740 int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1741 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1742 if(mergedLevel != level) {
1743 infoln(fileTestName);
1744 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1745 "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1746 (int)fileLineNumber, norm, order, mergedLevel, level);
1747 infoln(prevFileLine);
1748 infoln(fileLine);
1749 infoln(printCollationKey(prevKey));
1750 infoln(printCollationKey(key));
1751 return FALSE;
1752 }
1753 }
1754 }
1755 return TRUE;
1756 }
1757
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1758 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1759 if(errorCode.isFailure()) { return; }
1760 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1761 UnicodeString prevString, s;
1762 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1763 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1764 // Parse the line even if it will be ignored (when we do not have a Collator)
1765 // in order to report syntax issues.
1766 Collation::Level relation = parseRelationAndString(s, errorCode);
1767 if(errorCode.isFailure()) {
1768 errorCode.reset();
1769 break;
1770 }
1771 if(coll == NULL) {
1772 // We were unable to create the Collator but continue with tests.
1773 // Ignore test data for this Collator.
1774 // The next Collator creation might work.
1775 continue;
1776 }
1777 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1778 Collation::Level expectedLevel = relation;
1779 s.getTerminatedBuffer(); // Ensure NUL-termination.
1780 UBool isOk = TRUE;
1781 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1782 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1783 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1784 expectedOrder, expectedLevel, errorCode);
1785 }
1786 if(isOk) {
1787 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1788 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1789 expectedOrder, expectedLevel, errorCode);
1790 }
1791 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1792 UnicodeString pn = nfd->normalize(prevString, errorCode);
1793 UnicodeString n = nfd->normalize(s, errorCode);
1794 pn.getTerminatedBuffer();
1795 n.getTerminatedBuffer();
1796 errorCode.assertSuccess();
1797 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1798 expectedOrder, expectedLevel, errorCode);
1799 }
1800 if(!isOk) {
1801 errorCode.reset(); // already reported
1802 }
1803 prevFileLine = fileLine;
1804 prevString = s;
1805 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1806 }
1807 }
1808
TestDataDriven()1809 void CollationTest::TestDataDriven() {
1810 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1811
1812 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1813 nfd = Normalizer2::getNFDInstance(errorCode);
1814 if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1815 return;
1816 }
1817
1818 CharString path(getSourceTestData(errorCode), errorCode);
1819 path.appendPathPart("collationtest.txt", errorCode);
1820 const char *codePage = "UTF-8";
1821 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1822 if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1823 return;
1824 }
1825 // Read a new line if necessary.
1826 // Sub-parsers leave the first line set that they do not handle.
1827 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1828 if(!isSectionStarter(fileLine[0])) {
1829 errln("syntax error on line %d", (int)fileLineNumber);
1830 infoln(fileLine);
1831 return;
1832 }
1833 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1834 fileTestName = fileLine;
1835 logln(fileLine);
1836 fileLine.remove();
1837 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1838 setRootCollator(errorCode);
1839 fileLine.remove();
1840 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1841 setLocaleCollator(errorCode);
1842 fileLine.remove();
1843 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1844 buildTailoring(f.getAlias(), errorCode);
1845 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1846 parseAndSetAttribute(errorCode);
1847 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1848 checkCompareStrings(f.getAlias(), errorCode);
1849 } else {
1850 errln("syntax error on line %d", (int)fileLineNumber);
1851 infoln(fileLine);
1852 return;
1853 }
1854 }
1855 }
1856
TestLongLocale()1857 void CollationTest::TestLongLocale() {
1858 IcuTestErrorCode errorCode(*this, "TestLongLocale");
1859 Locale longLocale("sie__1G_C_CEIE_CEZCX_CSUE_E_EIESZNI2_GB_LM_LMCSUE_LMCSX_"
1860 "LVARIANT_MMCSIE_STEU_SU1GCEIE_SU6G_SU6SU6G_U_UBGE_UC_"
1861 "UCEZCSI_UCIE_UZSIU_VARIANT_X@collation=bcs-ukvsz");
1862 LocalPointer<Collator> coll(Collator::createInstance(longLocale, errorCode));
1863 }
1864
1865 #endif // !UCONFIG_NO_COLLATION
1866