1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationtest.cpp
7 *
8 * created on: 2012apr27
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/coll.h"
17 #include "unicode/errorcode.h"
18 #include "unicode/localpointer.h"
19 #include "unicode/normalizer2.h"
20 #include "unicode/sortkey.h"
21 #include "unicode/std_string.h"
22 #include "unicode/strenum.h"
23 #include "unicode/tblcoll.h"
24 #include "unicode/uiter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/unistr.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/ustring.h"
29 #include "charstr.h"
30 #include "cmemory.h"
31 #include "collation.h"
32 #include "collationdata.h"
33 #include "collationfcd.h"
34 #include "collationiterator.h"
35 #include "collationroot.h"
36 #include "collationrootelements.h"
37 #include "collationruleparser.h"
38 #include "collationweights.h"
39 #include "cstring.h"
40 #include "intltest.h"
41 #include "normalizer2impl.h"
42 #include "ucbuf.h"
43 #include "uhash.h"
44 #include "uitercollationiterator.h"
45 #include "utf16collationiterator.h"
46 #include "utf8collationiterator.h"
47 #include "uvectr32.h"
48 #include "uvectr64.h"
49 #include "writesrc.h"
50
// Number of elements in a fixed-size array, as an int32_t.
// Only meaningful for real arrays (not pointers). The whole expansion is wrapped in
// parentheses so that it always acts as a single operand in the surrounding expression.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
52
// TODO: Move to ucbuf.h
// Declares LocalUCHARBUFPointer for UCHARBUF objects, with ucbuf_close as the
// associated close function (presumably an owning "local pointer" wrapper in the
// style of unicode/localpointer.h -- confirm against the macro definition).
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);

// Forward declaration: CollationTest::checkFCD() takes a CodePointIterator reference;
// the class itself is defined further down in this file.
class CodePointIterator;
57
58 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
59
60 class CollationTest : public IntlTest {
61 public:
CollationTest()62 CollationTest()
63 : fcd(NULL), nfd(NULL),
64 fileLineNumber(0),
65 coll(NULL) {}
66
~CollationTest()67 ~CollationTest() {
68 delete coll;
69 }
70
71 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
72
73 void TestMinMax();
74 void TestImplicits();
75 void TestNulTerminated();
76 void TestIllegalUTF8();
77 void TestShortFCDData();
78 void TestFCD();
79 void TestCollationWeights();
80 void TestRootElements();
81 void TestTailoredElements();
82 void TestDataDriven();
83
84 private:
85 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
86 void checkAllocWeights(CollationWeights &cw,
87 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
88 int32_t someLength, int32_t minCount);
89
90 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
91 static UnicodeString printCollationKey(const CollationKey &key);
92
93 // Helpers & fields for data-driven test.
isCROrLF(UChar c)94 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)95 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)96 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
skipSpaces(int32_t i)97 int32_t skipSpaces(int32_t i) {
98 while(isSpace(fileLine[i])) { ++i; }
99 return i;
100 }
101
102 UBool readLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
103 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
104 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
105 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
106 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
107 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
108 void setRootCollator(IcuTestErrorCode &errorCode);
109 void setLocaleCollator(IcuTestErrorCode &errorCode);
110
111 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
112
113 UBool getSortKeyParts(const UChar *s, int32_t length,
114 CharString &dest, int32_t partSize,
115 IcuTestErrorCode &errorCode);
116 UBool getCollationKey(const char *norm, const UnicodeString &line,
117 const UChar *s, int32_t length,
118 CollationKey &key, IcuTestErrorCode &errorCode);
119 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
120 const UnicodeString &prevString, const UnicodeString &s,
121 UCollationResult expectedOrder, Collation::Level expectedLevel,
122 IcuTestErrorCode &errorCode);
123 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
124
125 const Normalizer2 *fcd, *nfd;
126 UnicodeString fileLine;
127 int32_t fileLineNumber;
128 UnicodeString fileTestName;
129 Collator *coll;
130 };
131
createCollationTest()132 extern IntlTest *createCollationTest() {
133 return new CollationTest();
134 }
135
// Test dispatcher for this suite.
// Logs a suite banner when exec is set, then hands over to the TESTCASE_AUTO* macros
// (defined outside this file). Presumably they map index to the test case at that
// position in the list below and report its name through the name parameter, so the
// order of the entries is significant -- confirm against intltest.h.
// The par argument is not used.
void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
    if(exec) {
        logln("TestSuite CollationTest: ");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestMinMax);
    TESTCASE_AUTO(TestImplicits);
    TESTCASE_AUTO(TestNulTerminated);
    TESTCASE_AUTO(TestIllegalUTF8);
    TESTCASE_AUTO(TestShortFCDData);
    TESTCASE_AUTO(TestFCD);
    TESTCASE_AUTO(TestCollationWeights);
    TESTCASE_AUTO(TestRootElements);
    TESTCASE_AUTO(TestTailoredElements);
    TESTCASE_AUTO(TestDataDriven);
    TESTCASE_AUTO_END;
}
153
TestMinMax()154 void CollationTest::TestMinMax() {
155 IcuTestErrorCode errorCode(*this, "TestMinMax");
156
157 setRootCollator(errorCode);
158 if(errorCode.isFailure()) {
159 errorCode.reset();
160 return;
161 }
162 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
163 if(rbc == NULL) {
164 errln("the root collator is not a RuleBasedCollator");
165 return;
166 }
167
168 static const UChar s[2] = { 0xfffe, 0xffff };
169 UVector64 ces(errorCode);
170 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
171 errorCode.assertSuccess();
172 if(ces.size() != 2) {
173 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
174 return;
175 }
176 int64_t ce = ces.elementAti(0);
177 int64_t expected =
178 ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
179 Collation::MERGE_SEPARATOR_LOWER32;
180 if(ce != expected) {
181 errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce);
182 }
183
184 ce = ces.elementAti(1);
185 expected = Collation::makeCE(Collation::MAX_PRIMARY);
186 if(ce != expected) {
187 errln("CE(U+ffff)=%04lx != max..", (long)ce);
188 }
189 }
190
TestImplicits()191 void CollationTest::TestImplicits() {
192 IcuTestErrorCode errorCode(*this, "TestImplicits");
193
194 const CollationData *cd = CollationRoot::getData(errorCode);
195 if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
196 return;
197 }
198
199 // Implicit primary weights should be assigned for the following sets,
200 // and sort in ascending order by set and then code point.
201 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
202 // core Han Unified Ideographs
203 UnicodeSet coreHan("[\\p{unified_ideograph}&"
204 "[\\p{Block=CJK_Unified_Ideographs}"
205 "\\p{Block=CJK_Compatibility_Ideographs}]]",
206 errorCode);
207 // all other Unified Han ideographs
208 UnicodeSet otherHan("[\\p{unified ideograph}-"
209 "[\\p{Block=CJK_Unified_Ideographs}"
210 "\\p{Block=CJK_Compatibility_Ideographs}]]",
211 errorCode);
212 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
213 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
214 if(errorCode.logIfFailureAndReset("UnicodeSet")) {
215 return;
216 }
217 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
218 UChar32 prev = 0;
219 uint32_t prevPrimary = 0;
220 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
221 for(int32_t i = 0; i < LENGTHOF(sets); ++i) {
222 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
223 while(iter->next()) {
224 UChar32 c = iter->getCodepoint();
225 UnicodeString s(c);
226 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
227 int64_t ce = ci.nextCE(errorCode);
228 int64_t ce2 = ci.nextCE(errorCode);
229 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
230 return;
231 }
232 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
233 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
234 continue;
235 }
236 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
237 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
238 (long)c, (long)(ce & 0xffffffff));
239 continue;
240 }
241 uint32_t primary = (uint32_t)(ce >> 32);
242 if(!(primary > prevPrimary)) {
243 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
244 (long)c, (long)primary, (long)prev, (long)prevPrimary);
245 }
246 prev = c;
247 prevPrimary = primary;
248 }
249 }
250 }
251
TestNulTerminated()252 void CollationTest::TestNulTerminated() {
253 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
254 const CollationData *data = CollationRoot::getData(errorCode);
255 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
256 return;
257 }
258
259 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
260
261 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
262 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
263 for(int32_t i = 0;; ++i) {
264 int64_t ce1 = ci1.nextCE(errorCode);
265 int64_t ce2 = ci2.nextCE(errorCode);
266 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
267 return;
268 }
269 if(ce1 != ce2) {
270 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
271 break;
272 }
273 if(ce1 == Collation::NO_CE) { break; }
274 }
275 }
276
TestIllegalUTF8()277 void CollationTest::TestIllegalUTF8() {
278 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
279
280 setRootCollator(errorCode);
281 if(errorCode.isFailure()) {
282 errorCode.reset();
283 return;
284 }
285 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
286
287 static const char *strings[] = {
288 // U+FFFD
289 "a\xef\xbf\xbdz",
290 // illegal byte sequences
291 "a\x80z", // trail byte
292 "a\xc1\x81z", // non-shortest form
293 "a\xe0\x82\x83z", // non-shortest form
294 "a\xed\xa0\x80z", // lead surrogate: would be U+D800
295 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
296 "a\xf0\x8f\xbf\xbfz", // non-shortest form
297 "a\xf4\x90\x80\x80z" // out of range: would be U+110000
298 };
299
300 StringPiece fffd(strings[0]);
301 for(int32_t i = 1; i < LENGTHOF(strings); ++i) {
302 StringPiece illegal(strings[i]);
303 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
304 if(order != UCOL_EQUAL) {
305 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
306 (int)i, order);
307 }
308 }
309 }
310
311 namespace {
312
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)313 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
314 for(UChar32 c = 0x10000; c < 0x110000;) {
315 UChar32 next = c + 0x400;
316 if(src.containsSome(c, next - 1)) {
317 dest.add(U16_LEAD(c));
318 }
319 c = next;
320 }
321 }
322
323 } // namespace
324
TestShortFCDData()325 void CollationTest::TestShortFCDData() {
326 // See CollationFCD class comments.
327 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
328 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
329 errorCode.assertSuccess();
330 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
331 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
332 UnicodeSet lccc; // actual
333 for(UChar32 c = 0; c <= 0xffff; ++c) {
334 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
335 }
336 UnicodeSet diff(expectedLccc);
337 diff.removeAll(lccc);
338 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
339 UnicodeString empty("[]");
340 UnicodeString diffString;
341 diff.toPattern(diffString, TRUE);
342 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
343 diff = lccc;
344 diff.removeAll(expectedLccc);
345 diff.toPattern(diffString, TRUE);
346 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
347
348 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
349 if (errorCode.isSuccess()) {
350 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
351 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
352 UnicodeSet tccc; // actual
353 for(UChar32 c = 0; c <= 0xffff; ++c) {
354 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
355 }
356 diff = expectedTccc;
357 diff.removeAll(tccc);
358 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
359 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
360 diff = tccc;
361 diff.removeAll(expectedTccc);
362 diff.toPattern(diffString, TRUE);
363 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
364 }
365 }
366
367 class CodePointIterator {
368 public:
CodePointIterator(const UChar32 * cp,int32_t length)369 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()370 void resetToStart() { pos = 0; }
next()371 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()372 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const373 int32_t getLength() const { return length; }
getIndex() const374 int getIndex() const { return (int)pos; }
375 private:
376 const UChar32 *cp;
377 int32_t length;
378 int32_t pos;
379 };
380
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)381 void CollationTest::checkFCD(const char *name,
382 CollationIterator &ci, CodePointIterator &cpi) {
383 IcuTestErrorCode errorCode(*this, "checkFCD");
384
385 // Iterate forward to the limit.
386 for(;;) {
387 UChar32 c1 = ci.nextCodePoint(errorCode);
388 UChar32 c2 = cpi.next();
389 if(c1 != c2) {
390 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
391 name, (long)c1, (long)c2, cpi.getIndex());
392 return;
393 }
394 if(c1 < 0) { break; }
395 }
396
397 // Iterate backward most of the way.
398 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
399 UChar32 c1 = ci.previousCodePoint(errorCode);
400 UChar32 c2 = cpi.previous();
401 if(c1 != c2) {
402 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
403 name, (long)c1, (long)c2, cpi.getIndex());
404 return;
405 }
406 }
407
408 // Forward again.
409 for(;;) {
410 UChar32 c1 = ci.nextCodePoint(errorCode);
411 UChar32 c2 = cpi.next();
412 if(c1 != c2) {
413 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
414 name, (long)c1, (long)c2, cpi.getIndex());
415 return;
416 }
417 if(c1 < 0) { break; }
418 }
419
420 // Iterate backward to the start.
421 for(;;) {
422 UChar32 c1 = ci.previousCodePoint(errorCode);
423 UChar32 c2 = cpi.previous();
424 if(c1 != c2) {
425 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
426 name, (long)c1, (long)c2, cpi.getIndex());
427 return;
428 }
429 if(c1 < 0) { break; }
430 }
431 }
432
TestFCD()433 void CollationTest::TestFCD() {
434 IcuTestErrorCode errorCode(*this, "TestFCD");
435 const CollationData *data = CollationRoot::getData(errorCode);
436 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
437 return;
438 }
439
440 // Input string, not FCD, NUL-terminated.
441 static const UChar s[] = {
442 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
443 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
444 0x327, 0x308, // ccc=202, 230
445 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
446 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
447 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
448 0xac01,
449 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
450 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
451 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
452 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
453 0x4e00, 0xf81,
454 0
455 };
456 // Expected code points.
457 static const UChar32 cp[] = {
458 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
459 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
460 0x1D15F, 0x1D16D,
461 0xac01,
462 0x63, 0x327, 0x1D165, 0x1D16D,
463 0x61,
464 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
465 0x4e00, 0xf71, 0xf80
466 };
467
468 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
469 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
470 return;
471 }
472 CodePointIterator cpi(cp, LENGTHOF(cp));
473 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
474
475 #if U_HAVE_STD_STRING
476 cpi.resetToStart();
477 std::string utf8;
478 UnicodeString(s).toUTF8String(utf8);
479 FCDUTF8CollationIterator u8ci(data, FALSE,
480 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
481 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
482 return;
483 }
484 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
485 #endif
486
487 cpi.resetToStart();
488 UCharIterator iter;
489 uiter_setString(&iter, s, LENGTHOF(s) - 1); // -1: without the terminating NUL
490 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
491 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
492 return;
493 }
494 checkFCD("FCDUIterCollationIterator", uici, cpi);
495 }
496
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)497 void CollationTest::checkAllocWeights(CollationWeights &cw,
498 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
499 int32_t someLength, int32_t minCount) {
500 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
501 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
502 (long)lowerLimit, (long)upperLimit, (long)n);
503 return;
504 }
505 uint32_t previous = lowerLimit;
506 int32_t count = 0; // number of weights that have someLength
507 for(int32_t i = 0; i < n; ++i) {
508 uint32_t w = cw.nextWeight();
509 if(w == 0xffffffff) {
510 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
511 "returns only %ld weights",
512 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
513 return;
514 }
515 if(!(previous < w && w < upperLimit)) {
516 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
517 "number %ld -> %lx not between %lx and %lx",
518 (long)lowerLimit, (long)upperLimit, (long)n,
519 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
520 return;
521 }
522 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
523 }
524 if(count < minCount) {
525 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
526 "returns only %ld < %ld weights of length %d",
527 (long)lowerLimit, (long)upperLimit, (long)n,
528 (long)count, (long)minCount, (int)someLength);
529 }
530 }
531
TestCollationWeights()532 void CollationTest::TestCollationWeights() {
533 CollationWeights cw;
534
535 // Non-compressible primaries use 254 second bytes 02..FF.
536 logln("CollationWeights.initForPrimary(non-compressible)");
537 cw.initForPrimary(FALSE);
538 // Expect 1 weight 11 and 254 weights 12xx.
539 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
540 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
541 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
542 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
543 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
544 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
545 // Expect 254^2=64516 three-byte weights.
546 // During computation, there should be 3 three-byte ranges
547 // 10ffff, 11xxxx, 120202.
548 // The middle one should be split 64515:1,
549 // and the newly-split-off range and the last ranged lengthened.
550 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
551 // Expect weights 1102 & 1103.
552 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
553 // Expect weights 102102 & 102103.
554 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
555
556 // Compressible primaries use 251 second bytes 04..FE.
557 logln("CollationWeights.initForPrimary(compressible)");
558 cw.initForPrimary(TRUE);
559 // Expect 1 weight 11 and 251 weights 12xx.
560 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
561 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
562 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
563 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
564 // Expect weights 1104 & 1105.
565 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
566 // Expect weights 102102 & 102103.
567 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
568
569 // Secondary and tertiary weights use only bytes 3 & 4.
570 logln("CollationWeights.initForSecondary()");
571 cw.initForSecondary();
572 // Expect weights fbxx and all four fc..ff.
573 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
574
575 logln("CollationWeights.initForTertiary()");
576 cw.initForTertiary();
577 // Expect weights 3dxx and both 3e & 3f.
578 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
579 }
580
581 namespace {
582
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)583 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
584 uint32_t p, uint32_t s, uint32_t ctq) {
585 uint32_t p1 = p >> 24;
586 uint32_t p2 = (p >> 16) & 0xff;
587 uint32_t p3 = (p >> 8) & 0xff;
588 uint32_t p4 = p & 0xff;
589 uint32_t s1 = s >> 8;
590 uint32_t s2 = s & 0xff;
591 // ctq = Case, Tertiary, Quaternary
592 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
593 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
594 uint32_t t1 = t >> 8;
595 uint32_t t2 = t & 0xff;
596 uint32_t q = ctq & Collation::QUATERNARY_MASK;
597 // No leading zero bytes.
598 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
599 return FALSE;
600 }
601 // No intermediate zero bytes.
602 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
603 return FALSE;
604 }
605 if(p2 != 0 && p3 == 0 && p4 != 0) {
606 return FALSE;
607 }
608 // Minimum & maximum lead bytes.
609 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
610 (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) ||
611 (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) {
612 return FALSE;
613 }
614 if(t1 != 0 && t1 > 0x3f) {
615 return FALSE;
616 }
617 if(c > 2) {
618 return FALSE;
619 }
620 // The valid byte range for the second primary byte depends on compressibility.
621 if(p2 != 0) {
622 if(data.isCompressibleLeadByte(p1)) {
623 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
624 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
625 return FALSE;
626 }
627 } else {
628 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
629 return FALSE;
630 }
631 }
632 }
633 // Other bytes just need to avoid the level separator.
634 // Trailing zeros are ok.
635 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
636 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
637 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
638 return FALSE;
639 }
640 // Well-formed CEs.
641 if(p == 0) {
642 if(s == 0) {
643 if(t == 0) {
644 // Completely ignorable CE.
645 // Quaternary CEs are not supported.
646 if(c != 0 || q != 0) {
647 return FALSE;
648 }
649 } else {
650 // Tertiary CE.
651 if(t < re.getTertiaryBoundary() || c != 2) {
652 return FALSE;
653 }
654 }
655 } else {
656 // Secondary CE.
657 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
658 return FALSE;
659 }
660 }
661 } else {
662 // Primary CE.
663 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
664 s >= re.getSecondaryBoundary()) {
665 return FALSE;
666 }
667 if(t == 0 || t >= re.getTertiaryBoundary()) {
668 return FALSE;
669 }
670 }
671 return TRUE;
672 }
673
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)674 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
675 uint32_t p = (uint32_t)(ce >> 32);
676 uint32_t secTer = (uint32_t)ce;
677 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
678 }
679
680 class RootElementsIterator {
681 public:
RootElementsIterator(const CollationData & root)682 RootElementsIterator(const CollationData &root)
683 : data(root),
684 elements(root.rootElements), length(root.rootElementsLength),
685 pri(0), secTer(0),
686 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
687
next()688 UBool next() {
689 if(index >= length) { return FALSE; }
690 uint32_t p = elements[index];
691 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
692 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
693 ++index;
694 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
695 return TRUE;
696 }
697 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
698 // End of a range, enumerate the primaries in the range.
699 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
700 p &= 0xffffff00;
701 if(pri == p) {
702 // Finished the range, return the next CE after it.
703 ++index;
704 return next();
705 }
706 U_ASSERT(pri < p);
707 // Return the next primary in this range.
708 UBool isCompressible = data.isCompressiblePrimary(pri);
709 if((pri & 0xffff) == 0) {
710 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
711 } else {
712 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
713 }
714 return TRUE;
715 }
716 // Simple primary CE.
717 ++index;
718 pri = p;
719 secTer = Collation::COMMON_SEC_AND_TER_CE;
720 return TRUE;
721 }
722
getPrimary() const723 uint32_t getPrimary() const { return pri; }
getSecTer() const724 uint32_t getSecTer() const { return secTer; }
725
726 private:
727 const CollationData &data;
728 const uint32_t *elements;
729 int32_t length;
730
731 uint32_t pri;
732 uint32_t secTer;
733 int32_t index;
734 };
735
736 } // namespace
737
TestRootElements()738 void CollationTest::TestRootElements() {
739 IcuTestErrorCode errorCode(*this, "TestRootElements");
740 const CollationData *root = CollationRoot::getData(errorCode);
741 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
742 return;
743 }
744 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
745 RootElementsIterator iter(*root);
746
747 // We check each root CE for validity,
748 // and we also verify that there is a tailoring gap between each two CEs.
749 CollationWeights cw1c; // compressible primary weights
750 CollationWeights cw1u; // uncompressible primary weights
751 CollationWeights cw2;
752 CollationWeights cw3;
753
754 cw1c.initForPrimary(TRUE);
755 cw1u.initForPrimary(FALSE);
756 cw2.initForSecondary();
757 cw3.initForTertiary();
758
759 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
760 // nor the special merge-separator CE for U+FFFE.
761 uint32_t prevPri = 0;
762 uint32_t prevSec = 0;
763 uint32_t prevTer = 0;
764 while(iter.next()) {
765 uint32_t pri = iter.getPrimary();
766 uint32_t secTer = iter.getSecTer();
767 // CollationRootElements CEs must have 0 case and quaternary bits.
768 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
769 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
770 (long)pri, (long)secTer);
771 }
772 uint32_t sec = secTer >> 16;
773 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
774 uint32_t ctq = ter;
775 if(pri == 0 && sec == 0 && ter != 0) {
776 // Tertiary CEs must have uppercase bits,
777 // but they are not stored in the CollationRootElements.
778 ctq |= 0x8000;
779 }
780 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
781 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
782 } else {
783 if(pri != prevPri) {
784 uint32_t newWeight = 0;
785 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
786 // There is currently no tailoring gap after primary ignorables,
787 // and we forbid tailoring after U+FFFD and U+FFFF.
788 } else if(root->isCompressiblePrimary(prevPri)) {
789 if(!cw1c.allocWeights(prevPri, pri, 1)) {
790 errln("no primary/compressible tailoring gap between %08lx and %08lx",
791 (long)prevPri, (long)pri);
792 } else {
793 newWeight = cw1c.nextWeight();
794 }
795 } else {
796 if(!cw1u.allocWeights(prevPri, pri, 1)) {
797 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
798 (long)prevPri, (long)pri);
799 } else {
800 newWeight = cw1u.nextWeight();
801 }
802 }
803 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
804 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
805 (long)prevPri, (long)newWeight, (long)pri);
806 }
807 } else if(sec != prevSec) {
808 uint32_t lowerLimit =
809 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
810 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
811 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
812 } else {
813 uint32_t newWeight = cw2.nextWeight();
814 if(!(prevSec < newWeight && newWeight < sec)) {
815 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
816 (long)lowerLimit, (long)newWeight, (long)sec);
817 }
818 }
819 } else if(ter != prevTer) {
820 uint32_t lowerLimit =
821 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
822 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
823 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
824 } else {
825 uint32_t newWeight = cw3.nextWeight();
826 if(!(prevTer < newWeight && newWeight < ter)) {
827 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
828 (long)lowerLimit, (long)newWeight, (long)ter);
829 }
830 }
831 } else {
832 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
833 }
834 }
835 prevPri = pri;
836 prevSec = sec;
837 prevTer = ter;
838 }
839 }
840
TestTailoredElements()841 void CollationTest::TestTailoredElements() {
842 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
843 const CollationData *root = CollationRoot::getData(errorCode);
844 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
845 return;
846 }
847 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
848
849 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
850 if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
851 return;
852 }
853 uhash_setKeyDeleter(prevLocales, uprv_free);
854 // TestRootElements() tests the root collator which does not have tailorings.
855 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
856 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
857 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
858
859 UVector64 ces(errorCode);
860 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
861 U_ASSERT(locales.isValid());
862 const char *localeID = "root";
863 do {
864 Locale locale(localeID);
865 LocalPointer<StringEnumeration> types(
866 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
867 errorCode.assertSuccess();
868 const char *type = NULL; // default type
869 do {
870 Locale localeWithType(locale);
871 if(type != NULL) {
872 localeWithType.setKeywordValue("collation", type, errorCode);
873 }
874 errorCode.assertSuccess();
875 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
876 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
877 localeWithType.getName())) {
878 continue;
879 }
880 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
881 if(uhash_geti(prevLocales, actual.getName()) != 0) {
882 continue;
883 }
884 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
885 errorCode.assertSuccess();
886 logln("TestTailoredElements(): requested %s -> actual %s",
887 localeWithType.getName(), actual.getName());
888 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
889 if(rbc == NULL) {
890 continue;
891 }
892 // Note: It would be better to get tailored strings such that we can
893 // identify the prefix, and only get the CEs for the prefix+string,
894 // not also for the prefix.
895 // There is currently no API for that.
896 // It would help in an unusual case where a contraction starting in the prefix
897 // extends past its end, and we do not see the intended mapping.
898 // For example, for a mapping p|st, if there is also a contraction ps,
899 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
900 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
901 errorCode.assertSuccess();
902 UnicodeSetIterator iter(*tailored);
903 while(iter.next()) {
904 const UnicodeString &s = iter.getString();
905 ces.removeAllElements();
906 rbc->internalGetCEs(s, ces, errorCode);
907 errorCode.assertSuccess();
908 for(int32_t i = 0; i < ces.size(); ++i) {
909 int64_t ce = ces.elementAti(i);
910 if(!isValidCE(rootElements, *root, ce)) {
911 errln("invalid tailored CE %016llx at CE index %d from string:",
912 (long long)ce, (int)i);
913 infoln(prettify(s));
914 }
915 }
916 }
917 } while((type = types->next(NULL, errorCode)) != NULL);
918 } while((localeID = locales->next(NULL, errorCode)) != NULL);
919 uhash_close(prevLocales);
920 }
921
printSortKey(const uint8_t * p,int32_t length)922 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
923 UnicodeString s;
924 for(int32_t i = 0; i < length; ++i) {
925 if(i > 0) { s.append((UChar)0x20); }
926 uint8_t b = p[i];
927 if(b == 0) {
928 s.append((UChar)0x2e); // period
929 } else if(b == 1) {
930 s.append((UChar)0x7c); // vertical bar
931 } else {
932 appendHex(b, 2, s);
933 }
934 }
935 return s;
936 }
937
printCollationKey(const CollationKey & key)938 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
939 int32_t length;
940 const uint8_t *p = key.getByteArray(length);
941 return printSortKey(p, length);
942 }
943
readLine(UCHARBUF * f,IcuTestErrorCode & errorCode)944 UBool CollationTest::readLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
945 int32_t lineLength;
946 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
947 if(line == NULL || errorCode.isFailure()) {
948 fileLine.remove();
949 return FALSE;
950 }
951 ++fileLineNumber;
952 // Strip trailing CR/LF, comments, and spaces.
953 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
954 if(comment != NULL) {
955 lineLength = (int32_t)(comment - line);
956 } else {
957 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
958 }
959 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
960 fileLine.setTo(FALSE, line, lineLength);
961 return TRUE;
962 }
963
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)964 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
965 UErrorCode &errorCode) {
966 int32_t length = fileLine.length();
967 int32_t i;
968 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
969 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
970 if(pipeIndex >= 0) {
971 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
972 if(prefix.isEmpty()) {
973 errln("empty prefix on line %d", (int)fileLineNumber);
974 infoln(fileLine);
975 errorCode = U_PARSE_ERROR;
976 return;
977 }
978 start = pipeIndex + 1;
979 } else {
980 prefix.remove();
981 }
982 s = fileLine.tempSubStringBetween(start, i).unescape();
983 if(s.isEmpty()) {
984 errln("empty string on line %d", (int)fileLineNumber);
985 infoln(fileLine);
986 errorCode = U_PARSE_ERROR;
987 return;
988 }
989 start = i;
990 }
991
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)992 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
993 Collation::Level relation;
994 int32_t start;
995 if(fileLine[0] == 0x3c) { // <
996 UChar second = fileLine[1];
997 start = 2;
998 switch(second) {
999 case 0x31: // <1
1000 relation = Collation::PRIMARY_LEVEL;
1001 break;
1002 case 0x32: // <2
1003 relation = Collation::SECONDARY_LEVEL;
1004 break;
1005 case 0x33: // <3
1006 relation = Collation::TERTIARY_LEVEL;
1007 break;
1008 case 0x34: // <4
1009 relation = Collation::QUATERNARY_LEVEL;
1010 break;
1011 case 0x63: // <c
1012 relation = Collation::CASE_LEVEL;
1013 break;
1014 case 0x69: // <i
1015 relation = Collation::IDENTICAL_LEVEL;
1016 break;
1017 default: // just <
1018 relation = Collation::NO_LEVEL;
1019 start = 1;
1020 break;
1021 }
1022 } else if(fileLine[0] == 0x3d) { // =
1023 relation = Collation::ZERO_LEVEL;
1024 start = 1;
1025 } else {
1026 start = 0;
1027 }
1028 if(start == 0 || !isSpace(fileLine[start])) {
1029 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1030 infoln(fileLine);
1031 errorCode.set(U_PARSE_ERROR);
1032 return Collation::NO_LEVEL;
1033 }
1034 start = skipSpaces(start);
1035 UnicodeString prefix;
1036 parseString(start, prefix, s, errorCode);
1037 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1038 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1039 infoln(fileLine);
1040 errorCode.set(U_PARSE_ERROR);
1041 return Collation::NO_LEVEL;
1042 }
1043 if(start < fileLine.length()) {
1044 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1045 infoln(fileLine);
1046 errorCode.set(U_PARSE_ERROR);
1047 return Collation::NO_LEVEL;
1048 }
1049 return relation;
1050 }
1051
// Maps the attribute names accepted on "% name=value" lines of the test data file
// to collator attributes. Searched linearly by CollationTest::parseAndSetAttribute().
static const struct {
    const char *name;     // attribute name as written in the test file (invariant characters)
    UColAttribute attr;   // the attribute passed to setAttribute()
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1065
// Maps the attribute value names accepted on "% name=value" lines of the test data file
// to collator attribute values. Searched linearly by CollationTest::parseAndSetAttribute().
// The table does not record which values are legal for which attribute;
// an illegal combination is reported when setAttribute() fails.
static const struct {
    const char *name;           // value name as written in the test file (invariant characters)
    UColAttributeValue value;   // the value passed to setAttribute()
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1083
parseAndSetAttribute(IcuTestErrorCode & errorCode)1084 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1085 int32_t start = skipSpaces(1);
1086 int32_t equalPos = fileLine.indexOf(0x3d);
1087 if(equalPos < 0) {
1088 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1089 parseAndSetReorderCodes(start + 7, errorCode);
1090 return;
1091 }
1092 errln("missing '=' on line %d", (int)fileLineNumber);
1093 infoln(fileLine);
1094 errorCode.set(U_PARSE_ERROR);
1095 return;
1096 }
1097
1098 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1099 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1100 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1101 UColReorderCode max;
1102 if(valueString == UNICODE_STRING("space", 5)) {
1103 max = UCOL_REORDER_CODE_SPACE;
1104 } else if(valueString == UNICODE_STRING("punct", 5)) {
1105 max = UCOL_REORDER_CODE_PUNCTUATION;
1106 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1107 max = UCOL_REORDER_CODE_SYMBOL;
1108 } else if(valueString == UNICODE_STRING("currency", 8)) {
1109 max = UCOL_REORDER_CODE_CURRENCY;
1110 } else {
1111 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1112 infoln(fileLine);
1113 errorCode.set(U_PARSE_ERROR);
1114 return;
1115 }
1116 coll->setMaxVariable(max, errorCode);
1117 if(errorCode.isFailure()) {
1118 errln("setMaxVariable() failed on line %d: %s",
1119 (int)fileLineNumber, errorCode.errorName());
1120 infoln(fileLine);
1121 return;
1122 }
1123 fileLine.remove();
1124 return;
1125 }
1126
1127 UColAttribute attr;
1128 for(int32_t i = 0;; ++i) {
1129 if(i == LENGTHOF(attributes)) {
1130 errln("invalid attribute name on line %d", (int)fileLineNumber);
1131 infoln(fileLine);
1132 errorCode.set(U_PARSE_ERROR);
1133 return;
1134 }
1135 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1136 attr = attributes[i].attr;
1137 break;
1138 }
1139 }
1140
1141 UColAttributeValue value;
1142 for(int32_t i = 0;; ++i) {
1143 if(i == LENGTHOF(attributeValues)) {
1144 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1145 infoln(fileLine);
1146 errorCode.set(U_PARSE_ERROR);
1147 return;
1148 }
1149 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1150 value = attributeValues[i].value;
1151 break;
1152 }
1153 }
1154
1155 coll->setAttribute(attr, value, errorCode);
1156 if(errorCode.isFailure()) {
1157 errln("illegal attribute=value combination on line %d: %s",
1158 (int)fileLineNumber, errorCode.errorName());
1159 infoln(fileLine);
1160 return;
1161 }
1162 fileLine.remove();
1163 }
1164
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1165 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1166 UVector32 reorderCodes(errorCode);
1167 while(start < fileLine.length()) {
1168 start = skipSpaces(start);
1169 int32_t limit = start;
1170 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1171 CharString name;
1172 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1173 int32_t code = CollationRuleParser::getReorderCode(name.data());
1174 if(code < -1) {
1175 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1176 infoln(fileLine);
1177 errorCode.set(U_PARSE_ERROR);
1178 return;
1179 }
1180 reorderCodes.addElement(code, errorCode);
1181 start = limit;
1182 }
1183 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1184 if(errorCode.isFailure()) {
1185 errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber, errorCode.errorName());
1186 infoln(fileLine);
1187 return;
1188 }
1189 fileLine.remove();
1190 }
1191
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1192 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1193 UnicodeString rules;
1194 while(readLine(f, errorCode)) {
1195 if(fileLine.isEmpty()) { continue; }
1196 if(isSectionStarter(fileLine[0])) { break; }
1197 rules.append(fileLine.unescape());
1198 }
1199 if(errorCode.isFailure()) { return; }
1200 logln(rules);
1201
1202 UParseError parseError;
1203 UnicodeString reason;
1204 delete coll;
1205 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1206 if(coll == NULL) {
1207 errln("unable to allocate a new collator");
1208 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1209 return;
1210 }
1211 if(errorCode.isFailure()) {
1212 errln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1213 infoln(UnicodeString(" reason: ") + reason);
1214 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1215 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1216 infoln(UnicodeString(" snippet: ...") +
1217 parseError.preContext + "(!)" + parseError.postContext + "...");
1218 }
1219 } else {
1220 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1221 UnicodeString(), reason);
1222 }
1223 }
1224
setRootCollator(IcuTestErrorCode & errorCode)1225 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1226 if(errorCode.isFailure()) { return; }
1227 delete coll;
1228 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1229 if(errorCode.isFailure()) {
1230 dataerrln("unable to create a root collator");
1231 return;
1232 }
1233 }
1234
setLocaleCollator(IcuTestErrorCode & errorCode)1235 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1236 if(errorCode.isFailure()) { return; }
1237 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1238 if(at >= 0) {
1239 fileLine.setCharAt(at, (UChar)0x2a); // *
1240 }
1241 CharString localeID;
1242 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1243 if(at >= 0) {
1244 localeID.data()[at - 9] = '@';
1245 }
1246 Locale locale(localeID.data());
1247 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1248 errln("invalid language tag on line %d", (int)fileLineNumber);
1249 infoln(fileLine);
1250 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1251 return;
1252 }
1253
1254 logln("creating a collator for locale ID %s", locale.getName());
1255 Collator *newColl = Collator::createInstance(locale, errorCode);
1256 if(errorCode.isFailure()) {
1257 dataerrln("unable to create a collator for locale %s on line %d",
1258 locale.getName(), (int)fileLineNumber);
1259 infoln(fileLine);
1260 return;
1261 }
1262 delete coll;
1263 coll = newColl;
1264 }
1265
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1266 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1267 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1268 // In some sequences with Tibetan composite vowel signs,
1269 // even if the string passes the FCD check,
1270 // those composites must be decomposed.
1271 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1272 int32_t index = 0;
1273 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1274 if(++index < s.length()) {
1275 UChar c = s[index];
1276 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1277 }
1278 }
1279 return FALSE;
1280 }
1281
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1282 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1283 CharString &dest, int32_t partSize,
1284 IcuTestErrorCode &errorCode) {
1285 if(errorCode.isFailure()) { return FALSE; }
1286 uint8_t part[32];
1287 U_ASSERT(partSize <= LENGTHOF(part));
1288 UCharIterator iter;
1289 uiter_setString(&iter, s, length);
1290 uint32_t state[2] = { 0, 0 };
1291 for(;;) {
1292 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1293 UBool done = partLength < partSize;
1294 if(done) {
1295 // At the end, append the next byte as well which should be 00.
1296 ++partLength;
1297 }
1298 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1299 if(done) {
1300 return errorCode.isSuccess();
1301 }
1302 }
1303 }
1304
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1305 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1306 const UChar *s, int32_t length,
1307 CollationKey &key, IcuTestErrorCode &errorCode) {
1308 if(errorCode.isFailure()) { return FALSE; }
1309 coll->getCollationKey(s, length, key, errorCode);
1310 if(errorCode.isFailure()) {
1311 infoln(fileTestName);
1312 errln("Collator(%s).getCollationKey() failed: %s",
1313 norm, errorCode.errorName());
1314 infoln(line);
1315 return FALSE;
1316 }
1317 int32_t keyLength;
1318 const uint8_t *keyBytes = key.getByteArray(keyLength);
1319 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1320 infoln(fileTestName);
1321 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1322 norm);
1323 infoln(line);
1324 infoln(printCollationKey(key));
1325 return FALSE;
1326 }
1327
1328 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1329 if(numLevels < UCOL_IDENTICAL) {
1330 ++numLevels;
1331 } else {
1332 numLevels = 5;
1333 }
1334 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1335 ++numLevels;
1336 }
1337 errorCode.assertSuccess();
1338 int32_t numLevelSeparators = 0;
1339 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1340 uint8_t b = keyBytes[i];
1341 if(b == 0) {
1342 infoln(fileTestName);
1343 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1344 infoln(line);
1345 infoln(printCollationKey(key));
1346 return FALSE;
1347 }
1348 if(b == 1) { ++numLevelSeparators; }
1349 }
1350 if(numLevelSeparators != (numLevels - 1)) {
1351 infoln(fileTestName);
1352 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1353 norm, (int)numLevelSeparators, (int)numLevels);
1354 infoln(line);
1355 infoln(printCollationKey(key));
1356 return FALSE;
1357 }
1358
1359 // If s contains U+FFFE, check that merged segments make the same key.
1360 LocalMemory<uint8_t> mergedKey;
1361 int32_t mergedKeyLength = 0;
1362 int32_t mergedKeyCapacity = 0;
1363 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1364 int32_t segmentStart = 0;
1365 for(int32_t i = 0;;) {
1366 if(i == sLength) {
1367 if(segmentStart == 0) {
1368 // s does not contain any U+FFFE.
1369 break;
1370 }
1371 } else if(s[i] != 0xfffe) {
1372 ++i;
1373 continue;
1374 }
1375 // Get the sort key for another segment and merge it into mergedKey.
1376 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1377 CollationKey key2;
1378 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1379 int32_t key1Length, key2Length;
1380 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1381 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1382 uint8_t *dest;
1383 int32_t minCapacity = key1Length + key2Length;
1384 if(key1Length > 0) { --minCapacity; }
1385 if(minCapacity <= mergedKeyCapacity) {
1386 dest = mergedKey.getAlias();
1387 } else {
1388 if(minCapacity <= 200) {
1389 mergedKeyCapacity = 200;
1390 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1391 mergedKeyCapacity *= 2;
1392 } else {
1393 mergedKeyCapacity = minCapacity;
1394 }
1395 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1396 }
1397 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1398 if(key1Length == 0) {
1399 // key2 is the sort key for the first segment.
1400 uprv_memcpy(dest, key2Bytes, key2Length);
1401 mergedKeyLength = key2Length;
1402 } else {
1403 mergedKeyLength =
1404 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1405 dest, mergedKeyCapacity);
1406 }
1407 if(i == sLength) { break; }
1408 segmentStart = ++i;
1409 }
1410 if(segmentStart != 0 &&
1411 (mergedKeyLength != keyLength ||
1412 uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
1413 infoln(fileTestName);
1414 errln("Collator(%s).getCollationKey(with U+FFFE) != "
1415 "ucol_mergeSortkeys(segments)",
1416 norm);
1417 infoln(line);
1418 infoln(printCollationKey(key));
1419 infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
1420 return FALSE;
1421 }
1422
1423 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1424 static const int32_t partSizes[] = { 32, 3, 1 };
1425 for(int32_t psi = 0; psi < LENGTHOF(partSizes); ++psi) {
1426 int32_t partSize = partSizes[psi];
1427 CharString parts;
1428 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1429 infoln(fileTestName);
1430 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1431 norm, (int)partSize, errorCode.errorName());
1432 infoln(line);
1433 return FALSE;
1434 }
1435 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1436 infoln(fileTestName);
1437 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1438 norm, (int)partSize);
1439 infoln(line);
1440 infoln(printCollationKey(key));
1441 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1442 return FALSE;
1443 }
1444 }
1445 return TRUE;
1446 }
1447
1448 namespace {
1449
1450 /**
1451 * Replaces unpaired surrogates with U+FFFD.
1452 * Returns s if no replacement was made, otherwise buffer.
1453 */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1454 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1455 int32_t i = 0;
1456 while(i < s.length()) {
1457 UChar32 c = s.char32At(i);
1458 if(U_IS_SURROGATE(c)) {
1459 if(buffer.length() < i) {
1460 buffer.append(s, buffer.length(), i - buffer.length());
1461 }
1462 buffer.append((UChar)0xfffd);
1463 }
1464 i += U16_LENGTH(c);
1465 }
1466 if(buffer.isEmpty()) {
1467 return s;
1468 }
1469 if(buffer.length() < i) {
1470 buffer.append(s, buffer.length(), i - buffer.length());
1471 }
1472 return buffer;
1473 }
1474
1475 }
1476
/**
 * Checks that prevString and s compare as expected through each comparison path
 * used below, in both argument orders where applicable:
 * compare(UnicodeString), compare(NUL-terminated UChar *), compareUTF8() and
 * internalCompareUTF8() (only if U_HAVE_STD_STRING), compare(UCharIterator),
 * and CollationKey::compareTo(). Finally, unless the keys are equal or
 * expectedLevel is NO_LEVEL, checks that the first sort key difference
 * is on expectedLevel.
 * The first mismatch is reported via errln()/infoln(), together with both file lines
 * and both sort keys, and ends the checking.
 *
 * @param norm          label for the collator configuration, used in error messages
 * @param prevFileLine  the test file line that prevString came from, used in error messages
 * @param prevString    the left string; its buffer is also used as a NUL-terminated
 *                      string if neither string contains a NUL character
 * @param s             the right string (from the current fileLine); same buffer usage
 * @param expectedOrder the expected result of comparing prevString with s
 * @param expectedLevel the level on which the sort keys are expected to differ,
 *                      or NO_LEVEL to skip this check
 * @param errorCode     ICU error code
 * @return TRUE if all checks passed; FALSE if errorCode was a failure on entry
 *         or if any check failed
 */
UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
                                     const UnicodeString &prevString, const UnicodeString &s,
                                     UCollationResult expectedOrder, Collation::Level expectedLevel,
                                     IcuTestErrorCode &errorCode) {
    if(errorCode.isFailure()) { return FALSE; }

    // Get the sort keys first, for error debug output.
    CollationKey prevKey;
    if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
                        prevKey, errorCode)) {
        return FALSE;
    }
    CollationKey key;
    if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }

    // compare(UnicodeString, UnicodeString), in both directions.
    UCollationResult order = coll->compare(prevString, s, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compare(s, prevString, errorCode);
    if(order != -expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
    if(!containNUL) {
        // Length -1: the buffers are read up to their NUL terminators.
        order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
        if(order != -expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

#if U_HAVE_STD_STRING
    // compare(UTF-16) treats unpaired surrogates like unassigned code points.
    // Unpaired surrogates cannot be converted to UTF-8.
    // Create valid UTF-16 strings if necessary, and use those for
    // both the expected compare() result and for the input to compare(UTF-8).
    UnicodeString prevBuffer, sBuffer;
    const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
    const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
    std::string prevUTF8, sUTF8;
    UnicodeString(prevValid).toUTF8String(prevUTF8);
    UnicodeString(sValid).toUTF8String(sUTF8);
    UCollationResult expectedUTF8Order;
    // surrogatesToFFFD() returns its input itself if nothing was replaced,
    // so comparing addresses tells whether either string was changed.
    if(&prevValid == &prevString && &sValid == &s) {
        expectedUTF8Order = expectedOrder;
    } else {
        expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
    }

    order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
    if(order != expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
    if(order != -expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    if(!containNUL) {
        order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
        if(order != expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
        if(order != -expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }
#endif

    // compare(UCharIterator, UCharIterator); only the previous-vs-current direction is tested.
    UCharIterator leftIter;
    UCharIterator rightIter;
    uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
    uiter_setString(&rightIter, s.getBuffer(), s.length());
    order = coll->compare(leftIter, rightIter, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
              "wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }

    // The sort keys must compare the same way as the strings.
    order = prevKey.compareTo(key, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
        // Find the level of the first difference between the two keys by
        // counting the level separators in their common prefix.
        int32_t prevKeyLength;
        const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
        int32_t keyLength;
        const uint8_t *bytes = key.getByteArray(keyLength);
        int32_t level = Collation::PRIMARY_LEVEL;
        // No length check in the loop: the keys were just found to be unequal, and
        // getCollationKey() verified that each is 00-terminated without interior 00 bytes,
        // so a differing byte is reached before the end of either key.
        for(int32_t i = 0;; ++i) {
            uint8_t b = prevBytes[i];
            if(b != bytes[i]) { break; }
            if(b == Collation::LEVEL_SEPARATOR_BYTE) {
                ++level;
                // Skip the case level number if the keys do not contain a case level.
                if(level == Collation::CASE_LEVEL &&
                        coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) {
                    ++level;
                }
            }
        }
        if(level != expectedLevel) {
            infoln(fileTestName);
            errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
                  (int)fileLineNumber, norm, order, level, expectedLevel);
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }
    return TRUE;
}
1666
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1667 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1668 if(errorCode.isFailure()) { return; }
1669 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1670 UnicodeString prevString, s;
1671 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1672 while(readLine(f, errorCode)) {
1673 if(fileLine.isEmpty()) { continue; }
1674 if(isSectionStarter(fileLine[0])) { break; }
1675 Collation::Level relation = parseRelationAndString(s, errorCode);
1676 if(errorCode.isFailure()) {
1677 errorCode.reset();
1678 break;
1679 }
1680 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1681 Collation::Level expectedLevel = relation;
1682 s.getTerminatedBuffer(); // Ensure NUL-termination.
1683 UBool isOk = TRUE;
1684 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1685 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1686 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1687 expectedOrder, expectedLevel, errorCode);
1688 }
1689 if(isOk) {
1690 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1691 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1692 expectedOrder, expectedLevel, errorCode);
1693 }
1694 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1695 UnicodeString pn = nfd->normalize(prevString, errorCode);
1696 UnicodeString n = nfd->normalize(s, errorCode);
1697 pn.getTerminatedBuffer();
1698 n.getTerminatedBuffer();
1699 errorCode.assertSuccess();
1700 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1701 expectedOrder, expectedLevel, errorCode);
1702 }
1703 if(!isOk) {
1704 errorCode.reset(); // already reported
1705 }
1706 prevFileLine = fileLine;
1707 prevString = s;
1708 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1709 }
1710 }
1711
TestDataDriven()1712 void CollationTest::TestDataDriven() {
1713 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1714
1715 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1716 nfd = Normalizer2Factory::getNFDInstance(errorCode);
1717 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1718 return;
1719 }
1720
1721 CharString path(getSourceTestData(errorCode), errorCode);
1722 path.appendPathPart("collationtest.txt", errorCode);
1723 const char *codePage = "UTF-8";
1724 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1725 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1726 return;
1727 }
1728 while(errorCode.isSuccess()) {
1729 // Read a new line if necessary.
1730 // Sub-parsers leave the first line set that they do not handle.
1731 if(fileLine.isEmpty()) {
1732 if(!readLine(f.getAlias(), errorCode)) { break; }
1733 continue;
1734 }
1735 if(!isSectionStarter(fileLine[0])) {
1736 errln("syntax error on line %d", (int)fileLineNumber);
1737 infoln(fileLine);
1738 return;
1739 }
1740 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1741 fileTestName = fileLine;
1742 logln(fileLine);
1743 fileLine.remove();
1744 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1745 setRootCollator(errorCode);
1746 fileLine.remove();
1747 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1748 setLocaleCollator(errorCode);
1749 fileLine.remove();
1750 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1751 buildTailoring(f.getAlias(), errorCode);
1752 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1753 parseAndSetAttribute(errorCode);
1754 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1755 checkCompareStrings(f.getAlias(), errorCode);
1756 } else {
1757 errln("syntax error on line %d", (int)fileLineNumber);
1758 infoln(fileLine);
1759 return;
1760 }
1761 }
1762 }
1763
1764 #endif // !UCONFIG_NO_COLLATION
1765