1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2000-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 05/23/00 aliu Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "unicode/translit.h"
18 #include "rbt.h"
19 #include "unicode/calendar.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/normlzr.h"
23 #include "unicode/uchar.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/usetiter.h"
26 #include "unicode/putil.h"
27 #include "unicode/uversion.h"
28 #include "unicode/locid.h"
29 #include "unicode/ulocdata.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utf16.h"
32 #include "putilimp.h"
33 #include "cmemory.h"
34 #include "transrt.h"
35 #include "testutil.h"
36 #include "uassert.h"
37 #include <string.h>
38 #include <stdio.h>
39
// Dispatch helper for runIndexedTest(): publishes the test name and, when
// `exec` is set, runs the test method and logs how long it took.
// uprv_getUTCtime() differences are in milliseconds, so the elapsed time is
// divided by U_MILLIS_PER_SECOND to match the " seconds" label in the log.
#define CASE(id,test) case id:                                              \
    name = #test;                                                           \
    if (exec) {                                                             \
        logln(#test "---");                                                 \
        logln((UnicodeString)"");                                           \
        UDate t = uprv_getUTCtime();                                        \
        test();                                                             \
        t = uprv_getUTCtime() - t;                                          \
        logln((UnicodeString)#test " took " + t/U_MILLIS_PER_SECOND + " seconds"); \
    }                                                                       \
    break
51
// Dispatch helper for tests that only run in exhaustive (non-quick) mode.
// In quick mode the case reports an empty name; otherwise it publishes the
// test name and, when `exec` is set, runs the test method.
#define EXHAUSTIVE(id,test) case id:        \
    if (quick) {                            \
        name = "";                          \
    } else {                                \
        name = #test;                       \
        if (exec) {                         \
            logln(#test "---");             \
            logln((UnicodeString)"");       \
            test();                         \
        }                                   \
    }                                       \
    break
64 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)65 TransliteratorRoundTripTest::runIndexedTest(int32_t index, UBool exec,
66 const char* &name, char* /*par*/) {
67 switch (index) {
68 CASE(0, TestCyrillic);
69 // CASE(0,TestKana);
70 CASE(1,TestHiragana);
71 CASE(2,TestKatakana);
72 CASE(3,TestJamo);
73 CASE(4,TestHangul);
74 CASE(5,TestGreek);
75 CASE(6,TestGreekUNGEGN);
76 CASE(7,Testel);
77 CASE(8,TestDevanagariLatin);
78 CASE(9,TestInterIndic);
79 CASE(10, TestHebrew);
80 CASE(11, TestArabic);
81 CASE(12, TestHan);
82 default: name = ""; break;
83 }
84 }
85
86
87 //--------------------------------------------------------------------
88 // TransliteratorPointer
89 //--------------------------------------------------------------------
90
91 /**
92 * A transliterator pointer wrapper that deletes the contained
93 * pointer automatically when the wrapper goes out of scope.
94 * Sometimes called a "janitor" or "smart pointer".
95 */
96 class TransliteratorPointer {
97 Transliterator* t;
98 // disallowed:
99 TransliteratorPointer(const TransliteratorPointer& rhs);
100 TransliteratorPointer& operator=(const TransliteratorPointer& rhs);
101 public:
TransliteratorPointer(Transliterator * adopted)102 TransliteratorPointer(Transliterator* adopted) {
103 t = adopted;
104 }
~TransliteratorPointer()105 ~TransliteratorPointer() {
106 delete t;
107 }
operator ->()108 inline Transliterator* operator->() { return t; }
operator const Transliterator*() const109 inline operator const Transliterator*() const { return t; }
operator Transliterator*()110 inline operator Transliterator*() { return t; }
111 };
112
113 //--------------------------------------------------------------------
114 // Legal
115 //--------------------------------------------------------------------
116
117 class Legal {
118 public:
Legal()119 Legal() {}
~Legal()120 virtual ~Legal() {}
is(const UnicodeString &) const121 virtual UBool is(const UnicodeString& /*sourceString*/) const {return true;}
122 };
123
124 class LegalJamo : public Legal {
125 // any initial must be followed by a medial (or initial)
126 // any medial must follow an initial (or medial)
127 // any final must follow a medial (or final)
128 public:
LegalJamo()129 LegalJamo() {}
~LegalJamo()130 virtual ~LegalJamo() {}
131 virtual UBool is(const UnicodeString& sourceString) const override;
132 int getType(UChar c) const;
133 };
134
is(const UnicodeString & sourceString) const135 UBool LegalJamo::is(const UnicodeString& sourceString) const {
136 int t;
137 UnicodeString decomp;
138 UErrorCode ec = U_ZERO_ERROR;
139 Normalizer::decompose(sourceString, false, 0, decomp, ec);
140 if (U_FAILURE(ec)) {
141 return false;
142 }
143 for (int i = 0; i < decomp.length(); ++i) { // don't worry about surrogates
144 switch (getType(decomp.charAt(i))) {
145 case 0: t = getType(decomp.charAt(i+1));
146 if (t != 0 && t != 1) { return false; }
147 break;
148 case 1: t = getType(decomp.charAt(i-1));
149 if (t != 0 && t != 1) { return false; }
150 break;
151 case 2: t = getType(decomp.charAt(i-1));
152 if (t != 1 && t != 2) { return false; }
153 break;
154 }
155 }
156 return true;
157 }
158
getType(UChar c) const159 int LegalJamo::getType(UChar c) const {
160 if (0x1100 <= c && c <= 0x1112)
161 return 0;
162 else if (0x1161 <= c && c <= 0x1175)
163 return 1;
164 else if (0x11A8 <= c && c <= 0x11C2)
165 return 2;
166 return -1; // other
167 }
168
169 class LegalGreek : public Legal {
170 UBool full;
171 public:
LegalGreek(UBool _full)172 LegalGreek(UBool _full) { full = _full; }
~LegalGreek()173 virtual ~LegalGreek() {}
174
175 virtual UBool is(const UnicodeString& sourceString) const override;
176
177 static UBool isVowel(UChar c);
178
179 static UBool isRho(UChar c);
180 };
181
is(const UnicodeString & sourceString) const182 UBool LegalGreek::is(const UnicodeString& sourceString) const {
183 UnicodeString decomp;
184 UErrorCode ec = U_ZERO_ERROR;
185 Normalizer::decompose(sourceString, false, 0, decomp, ec);
186
187 // modern is simpler: don't care about anything but a grave
188 if (full == false) {
189 // A special case which is legal but should be
190 // excluded from round trip
191 // if (sourceString == UnicodeString("\\u039C\\u03C0", "")) {
192 // return false;
193 // }
194 for (int32_t i = 0; i < decomp.length(); ++i) {
195 UChar c = decomp.charAt(i);
196 // exclude all the accents
197 if (c == 0x0313 || c == 0x0314 || c == 0x0300 || c == 0x0302
198 || c == 0x0342 || c == 0x0345
199 ) return false;
200 }
201 return true;
202 }
203
204 // Legal greek has breathing marks IFF there is a vowel or RHO at the start
205 // IF it has them, it has exactly one.
206 // IF it starts with a RHO, then the breathing mark must come before the second letter.
207 // Since there are no surrogates in greek, don't worry about them
208 UBool firstIsVowel = false;
209 UBool firstIsRho = false;
210 UBool noLetterYet = true;
211 int32_t breathingCount = 0;
212 int32_t letterCount = 0;
213 for (int32_t i = 0; i < decomp.length(); ++i) {
214 UChar c = decomp.charAt(i);
215 if (u_isalpha(c)) {
216 ++letterCount;
217 if (noLetterYet) {
218 noLetterYet = false;
219 firstIsVowel = isVowel(c);
220 firstIsRho = isRho(c);
221 }
222 if (firstIsRho && letterCount == 2 && breathingCount == 0) {
223 return false;
224 }
225 }
226 if (c == 0x0313 || c == 0x0314) {
227 ++breathingCount;
228 }
229 }
230
231 if (firstIsVowel || firstIsRho) return breathingCount == 1;
232 return breathingCount == 0;
233 }
234
isVowel(UChar c)235 UBool LegalGreek::isVowel(UChar c) {
236 switch (c) {
237 case 0x03B1:
238 case 0x03B5:
239 case 0x03B7:
240 case 0x03B9:
241 case 0x03BF:
242 case 0x03C5:
243 case 0x03C9:
244 case 0x0391:
245 case 0x0395:
246 case 0x0397:
247 case 0x0399:
248 case 0x039F:
249 case 0x03A5:
250 case 0x03A9:
251 return true;
252 }
253 return false;
254 }
255
isRho(UChar c)256 UBool LegalGreek::isRho(UChar c) {
257 switch (c) {
258 case 0x03C1:
259 case 0x03A1:
260 return true;
261 }
262 return false;
263 }
264
265 namespace {
266
267 /**
268 * If abbreviated=true, returns a set which only a sampling of the original code points.
269 * density is the approximate total number of code points to returned for the entire set.
270 */
abbreviateSet(const UnicodeSet & set,bool abbreviated,int density,UnicodeSet & copy)271 const UnicodeSet &abbreviateSet(const UnicodeSet &set, bool abbreviated, int density,
272 UnicodeSet ©) {
273 if (!abbreviated) {
274 return set;
275 }
276 int32_t rangeCount = set.getRangeCount();
277 int32_t perRange = rangeCount;
278 if (perRange != 0) {
279 perRange = density / perRange;
280 }
281 const UnicodeSet *p = &set;
282 bool unchanged = true;
283 for (int32_t i = 0; i < rangeCount; ++i) {
284 int32_t start = set.getRangeStart(i);
285 int32_t end = set.getRangeEnd(i);
286 int32_t newEnd = start + perRange;
287 if (end > newEnd) {
288 if (unchanged) {
289 copy = set;
290 p = ©
291 unchanged = false;
292 }
293 copy.remove(newEnd + 1, end);
294 }
295 }
296 return *p;
297 }
298
299 } // namespace
300
301 //--------------------------------------------------------------------
302 // RTTest Interface
303 //--------------------------------------------------------------------
304
305 class RTTest : public IntlTest {
306
307 // PrintWriter out;
308
309 UnicodeString transliteratorID;
310 int32_t errorLimit;
311 int32_t errorCount;
312 int32_t pairLimit;
313 UnicodeSet sourceRange;
314 UnicodeSet targetRange;
315 UnicodeSet toSource;
316 UnicodeSet toTarget;
317 UnicodeSet roundtripExclusionsSet;
318 IntlTest* parent;
319 Legal* legalSource; // NOT owned
320 UnicodeSet badCharacters;
321
322 public:
323
324 /*
325 * create a test for the given script transliterator.
326 */
327 RTTest(const UnicodeString& transliteratorIDStr);
328
329 virtual ~RTTest();
330
331 void setErrorLimit(int32_t limit);
332
333 void setPairLimit(int32_t limit);
334
335 void test(const UnicodeString& sourceRange,
336 const UnicodeString& targetRange,
337 const char* roundtripExclusions,
338 IntlTest* parent,
339 UBool quick,
340 Legal* adoptedLegal,
341 int32_t density = 100);
342
343 private:
344
345 // Added to do better equality check.
346
347 static UBool isSame(const UnicodeString& a, const UnicodeString& b);
348
349 static UBool isCamel(const UnicodeString& a);
350
351 UBool checkIrrelevants(Transliterator *t, const UnicodeString& irrelevants);
352
353 void test2(UBool quick, int32_t density);
354
355 void logWrongScript(const UnicodeString& label,
356 const UnicodeString& from,
357 const UnicodeString& to);
358
359 void logNotCanonical(const UnicodeString& label,
360 const UnicodeString& from,
361 const UnicodeString& to,
362 const UnicodeString& fromCan,
363 const UnicodeString& toCan);
364
365 void logFails(const UnicodeString& label);
366
367 void logToRulesFails(const UnicodeString& label,
368 const UnicodeString& from,
369 const UnicodeString& to,
370 const UnicodeString& toCan);
371
372 void logRoundTripFailure(const UnicodeString& from,
373 const UnicodeString& toID,
374 const UnicodeString& to,
375 const UnicodeString& backID,
376 const UnicodeString& back);
377 };
378
379 //--------------------------------------------------------------------
380 // RTTest Implementation
381 //--------------------------------------------------------------------
382
383 /*
384 * create a test for the given script transliterator.
385 */
RTTest(const UnicodeString & transliteratorIDStr)386 RTTest::RTTest(const UnicodeString& transliteratorIDStr) {
387 transliteratorID = transliteratorIDStr;
388 errorLimit = 500;
389 errorCount = 0;
390 pairLimit = 0x10000;
391 }
392
~RTTest()393 RTTest::~RTTest() {
394 }
395
setErrorLimit(int32_t limit)396 void RTTest::setErrorLimit(int32_t limit) {
397 errorLimit = limit;
398 }
399
setPairLimit(int32_t limit)400 void RTTest::setPairLimit(int32_t limit) {
401 pairLimit = limit;
402 }
403
isSame(const UnicodeString & a,const UnicodeString & b)404 UBool RTTest::isSame(const UnicodeString& a, const UnicodeString& b) {
405 if (a == b) return true;
406 if (a.caseCompare(b, U_FOLD_CASE_DEFAULT)==0 && isCamel(a)) return true;
407 UnicodeString aa, bb;
408 UErrorCode ec = U_ZERO_ERROR;
409 Normalizer::decompose(a, false, 0, aa, ec);
410 Normalizer::decompose(b, false, 0, bb, ec);
411 if (aa == bb) return true;
412 if (aa.caseCompare(bb, U_FOLD_CASE_DEFAULT)==0 && isCamel(aa)) return true;
413 return false;
414 }
415
isCamel(const UnicodeString & a)416 UBool RTTest::isCamel(const UnicodeString& a) {
417 // see if string is of the form aB; e.g. lower, then upper or title
418 UChar32 cp;
419 UBool haveLower = false;
420 for (int32_t i = 0; i < a.length(); i += U16_LENGTH(cp)) {
421 cp = a.char32At(i);
422 int8_t t = u_charType(cp);
423 switch (t) {
424 case U_UPPERCASE_LETTER:
425 if (haveLower) return true;
426 break;
427 case U_TITLECASE_LETTER:
428 if (haveLower) return true;
429 // fall through, since second letter is lower.
430 U_FALLTHROUGH;
431 case U_LOWERCASE_LETTER:
432 haveLower = true;
433 break;
434 }
435 }
436 return false;
437 }
438
test(const UnicodeString & sourceRangeVal,const UnicodeString & targetRangeVal,const char * roundtripExclusions,IntlTest * logVal,UBool quickRt,Legal * adoptedLegal,int32_t density)439 void RTTest::test(const UnicodeString& sourceRangeVal,
440 const UnicodeString& targetRangeVal,
441 const char* roundtripExclusions,
442 IntlTest* logVal, UBool quickRt,
443 Legal* adoptedLegal,
444 int32_t density)
445 {
446
447 UErrorCode status = U_ZERO_ERROR;
448
449 this->parent = logVal;
450 this->legalSource = adoptedLegal;
451
452 UnicodeSet neverOk("[:Other:]", status);
453 UnicodeSet okAnyway("[^[:Letter:]]", status);
454
455 if (U_FAILURE(status)) {
456 parent->dataerrln("FAIL: Initializing UnicodeSet with [:Other:] or [^[:Letter:]] - Error: %s", u_errorName(status));
457 return;
458 }
459
460 this->sourceRange.clear();
461 this->sourceRange.applyPattern(sourceRangeVal, status);
462 if (U_FAILURE(status)) {
463 parent->errln("FAIL: UnicodeSet::applyPattern(" +
464 sourceRangeVal + ")");
465 return;
466 }
467 this->sourceRange.removeAll(neverOk);
468
469 this->targetRange.clear();
470 this->targetRange.applyPattern(targetRangeVal, status);
471 if (U_FAILURE(status)) {
472 parent->errln("FAIL: UnicodeSet::applyPattern(" +
473 targetRangeVal + ")");
474 return;
475 }
476 this->targetRange.removeAll(neverOk);
477
478 this->toSource.clear();
479 this->toSource.applyPattern(sourceRangeVal, status);
480 if (U_FAILURE(status)) {
481 parent->errln("FAIL: UnicodeSet::applyPattern(" +
482 sourceRangeVal + ")");
483 return;
484 }
485 this->toSource.addAll(okAnyway);
486
487 this->toTarget.clear();
488 this->toTarget.applyPattern(targetRangeVal, status);
489 if (U_FAILURE(status)) {
490 parent->errln("FAIL: UnicodeSet::applyPattern(" +
491 targetRangeVal + ")");
492 return;
493 }
494 this->toTarget.addAll(okAnyway);
495
496 this->roundtripExclusionsSet.clear();
497 if (roundtripExclusions != NULL && strlen(roundtripExclusions) > 0) {
498 this->roundtripExclusionsSet.applyPattern(UnicodeString(roundtripExclusions, -1, US_INV), status);
499 if (U_FAILURE(status)) {
500 parent->errln("FAIL: UnicodeSet::applyPattern(%s)", roundtripExclusions);
501 return;
502 }
503 }
504
505 badCharacters.clear();
506 badCharacters.applyPattern("[:Other:]", status);
507 if (U_FAILURE(status)) {
508 parent->errln("FAIL: UnicodeSet::applyPattern([:Other:])");
509 return;
510 }
511
512 test2(quickRt, density);
513
514 if (errorCount > 0) {
515 char str[100];
516 int32_t length = transliteratorID.extract(str, 100, NULL, status);
517 str[length] = 0;
518 parent->errln("FAIL: %s errors: %d %s", str, errorCount, (errorCount > errorLimit ? " (at least!)" : " ")); // + ", see " + logFileName);
519 } else {
520 char str[100];
521 int32_t length = transliteratorID.extract(str, 100, NULL, status);
522 str[length] = 0;
523 parent->logln("%s ok", str);
524 }
525 }
526
checkIrrelevants(Transliterator * t,const UnicodeString & irrelevants)527 UBool RTTest::checkIrrelevants(Transliterator *t,
528 const UnicodeString& irrelevants) {
529 for (int i = 0; i < irrelevants.length(); ++i) {
530 UChar c = irrelevants.charAt(i);
531 UnicodeString srcStr(c);
532 UnicodeString targ = srcStr;
533 t->transliterate(targ);
534 if (srcStr == targ) return true;
535 }
536 return false;
537 }
538
/*
 * Runs the actual round-trip checks for transliteratorID, using the sets
 * prepared by test(). In order:
 *   1. create the forward transliterator and its inverse;
 *   2. check that at least one "irrelevant" character is left unchanged;
 *   3. (non-quick only) check that transliterators rebuilt from toRules()
 *      give the same results as the originals;
 *   4. source -> target for single code points, then for pairs;
 *   5. target -> source -> target for single items, then for pairs.
 * Failures are reported through the log* helpers. The function returns
 * early if a transliterator cannot be created, if a decomposition fails,
 * or when the pair limit is exceeded in the last pass.
 *
 * quickRt: skip step 3 and sample the sets (via abbreviateSet) in the pair passes.
 * density: sampling density passed to abbreviateSet.
 */
void RTTest::test2(UBool quickRt, int32_t density) {

    UnicodeString srcStr, targ, reverse;
    UErrorCode status = U_ZERO_ERROR;
    UParseError parseError ;
    TransliteratorPointer sourceToTarget(
        Transliterator::createInstance(transliteratorID, UTRANS_FORWARD, parseError,
        status));
    if ((Transliterator *)sourceToTarget == NULL) {
        parent->dataerrln("FAIL: createInstance(" + transliteratorID +
            ") returned NULL. Error: " + u_errorName(status)
            + "\n\tpreContext : " + prettify(parseError.preContext)
            + "\n\tpostContext : " + prettify(parseError.postContext));

        return;
    }
    TransliteratorPointer targetToSource(sourceToTarget->createInverse(status));
    if ((Transliterator *)targetToSource == NULL) {
        parent->errln("FAIL: " + transliteratorID +
            ".createInverse() returned NULL. Error:" + u_errorName(status)
            + "\n\tpreContext : " + prettify(parseError.preContext)
            + "\n\tpostContext : " + prettify(parseError.postContext));
        return;
    }

    UnicodeSetIterator usi;
    UnicodeSetIterator usi2;

    parent->logln("Checking that at least one irrelevant character is not NFC'ed");
    // string is from NFC_NO in the UCD
    UnicodeString irrelevants = CharsToUnicodeString("\\u2000\\u2001\\u2126\\u212A\\u212B\\u2329");

    if (checkIrrelevants(sourceToTarget, irrelevants) == false) {
        logFails("Source-Target, irrelevants");
    }
    if (checkIrrelevants(targetToSource, irrelevants) == false) {
        logFails("Target-Source, irrelevants");
    }

    // toRules check: only in non-quick mode.
    if (!quickRt){
        parent->logln("Checking that toRules works");
        UnicodeString rules = "";

        // Note: this declaration shadows the function-level parseError.
        UParseError parseError;
        rules = sourceToTarget->toRules(rules, true);
        // parent->logln((UnicodeString)"toRules => " + rules);
        TransliteratorPointer sourceToTarget2(Transliterator::createFromRules(
            "s2t2", rules,
            UTRANS_FORWARD,
            parseError, status));
        if (U_FAILURE(status)) {
            parent->errln("FAIL: createFromRules %s\n", u_errorName(status));
            return;
        }

        rules = targetToSource->toRules(rules, false);
        TransliteratorPointer targetToSource2(Transliterator::createFromRules(
            "t2s2", rules,
            UTRANS_FORWARD,
            parseError, status));
        if (U_FAILURE(status)) {
            parent->errln("FAIL: createFromRules %s\n", u_errorName(status));
            return;
        }

        // Compare original vs. rebuilt forward transliterator on every source code point.
        // The loop stops at the first string element of the set.
        usi.reset(sourceRange);
        for (;;) {
            if (!usi.next() || usi.isString()) break;
            UChar32 c = usi.getCodepoint();

            // These locals shadow the function-level srcStr / targ.
            UnicodeString srcStr((UChar32)c);
            UnicodeString targ = srcStr;
            sourceToTarget->transliterate(targ);
            UnicodeString targ2 = srcStr;
            sourceToTarget2->transliterate(targ2);
            if (targ != targ2) {
                logToRulesFails("Source-Target, toRules", srcStr, targ, targ2);
            }
        }

        // Same comparison for the inverse transliterator on every target code point.
        usi.reset(targetRange);
        for (;;) {
            if (!usi.next() || usi.isString()) break;
            UChar32 c = usi.getCodepoint();

            UnicodeString srcStr((UChar32)c);
            UnicodeString targ = srcStr;
            targetToSource->transliterate(targ);
            UnicodeString targ2 = srcStr;
            targetToSource2->transliterate(targ2);
            if (targ != targ2) {
                logToRulesFails("Target-Source, toRules", srcStr, targ, targ2);
            }
        }
    }

    parent->logln("Checking that all source characters convert to target - Singles");

    // Source code points whose output is not acceptable; excluded from the pair pass below.
    UnicodeSet failSourceTarg;
    usi.reset(sourceRange);
    for (;;) {
        if (!usi.next() || usi.isString()) break;
        UChar32 c = usi.getCodepoint();

        UnicodeString srcStr((UChar32)c);
        UnicodeString targ = srcStr;
        sourceToTarget->transliterate(targ);
        if (toTarget.containsAll(targ) == false
            || badCharacters.containsSome(targ) == true) {
            // Give the result a second chance in decomposed form before failing it.
            UnicodeString targD;
            Normalizer::decompose(targ, false, 0, targD, status);
            if (U_FAILURE(status)) {
                parent->errln("FAIL: Internal error during decomposition %s\n", u_errorName(status));
                return;
            }
            if (toTarget.containsAll(targD) == false ||
                badCharacters.containsSome(targD) == true) {
                logWrongScript("Source-Target", srcStr, targ);
                failSourceTarg.add(c);
                continue;
            }
        }

        // The decomposed source must transliterate to the same result as the source itself.
        UnicodeString cs2;
        Normalizer::decompose(srcStr, false, 0, cs2, status);
        if (U_FAILURE(status)) {
            parent->errln("FAIL: Internal error during decomposition %s\n", u_errorName(status));
            return;
        }
        UnicodeString targ2 = cs2;
        sourceToTarget->transliterate(targ2);
        if (targ != targ2) {
            logNotCanonical("Source-Target", srcStr, targ,cs2, targ2);
        }
    }

    parent->logln("Checking that all source characters convert to target - Doubles");

    UnicodeSet sourceRangeMinusFailures(sourceRange);
    sourceRangeMinusFailures.removeAll(failSourceTarg);

    // Scratch sets handed to abbreviateSet for the outer (copy) and inner (copy2) iterators.
    UnicodeSet copy, copy2;
    usi.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy));
    for (;;) {
        if (!usi.next() || usi.isString()) break;
        UChar32 c = usi.getCodepoint();

        usi2.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy2));
        for (;;) {
            if (!usi2.next() || usi2.isString()) break;
            UChar32 d = usi2.getCodepoint();

            // Same checks as the Singles pass, applied to the two-code-point string c + d.
            UnicodeString srcStr;
            srcStr += (UChar32)c;
            srcStr += (UChar32)d;
            UnicodeString targ = srcStr;
            sourceToTarget->transliterate(targ);
            if (toTarget.containsAll(targ) == false ||
                badCharacters.containsSome(targ) == true)
            {
                UnicodeString targD;
                Normalizer::decompose(targ, false, 0, targD, status);
                if (U_FAILURE(status)) {
                    parent->errln("FAIL: Internal error during decomposition %s\n", u_errorName(status));
                    return;
                }
                if (toTarget.containsAll(targD) == false ||
                    badCharacters.containsSome(targD) == true) {
                    logWrongScript("Source-Target", srcStr, targ);
                    continue;
                }
            }
            UnicodeString cs2;
            Normalizer::decompose(srcStr, false, 0, cs2, status);
            if (U_FAILURE(status)) {
                parent->errln("FAIL: Internal error during decomposition %s\n", u_errorName(status));
                return;
            }
            UnicodeString targ2 = cs2;
            sourceToTarget->transliterate(targ2);
            if (targ != targ2) {
                logNotCanonical("Source-Target", srcStr, targ, cs2,targ2);
            }
        }
    }

    parent->logln("Checking that target characters convert to source and back - Singles");

    // First code points of target items that failed; both sets are removed
    // from the target range before the pair pass below.
    UnicodeSet failTargSource;
    UnicodeSet failRound;

    usi.reset(targetRange);
    for (;;) {
        if (!usi.next()) break;

        // Unlike the other passes, string elements of the set are tested here too.
        if(usi.isString()){
            srcStr = usi.getString();
        }else{
            srcStr = (UnicodeString)usi.getCodepoint();
        }

        UChar32 c = srcStr.char32At(0);

        // targ = target -> source; reverse = that result converted back to target.
        targ = srcStr;
        targetToSource->transliterate(targ);
        reverse = targ;
        sourceToTarget->transliterate(reverse);

        if (toSource.containsAll(targ) == false ||
            badCharacters.containsSome(targ) == true) {
            UnicodeString targD;
            Normalizer::decompose(targ, false, 0, targD, status);
            if (U_FAILURE(status)) {
                parent->errln("FAIL: Internal error during decomposition%s\n", u_errorName(status));
                return;
            }
            if (toSource.containsAll(targD) == false) {
                logWrongScript("Target-Source", srcStr, targ);
                failTargSource.add(c);
                continue;
            }
            if (badCharacters.containsSome(targD) == true) {
                logWrongScript("Target-Source*", srcStr, targ);
                failTargSource.add(c);
                continue;
            }
        }
        // Round-trip mismatch, unless the item is listed in the exclusions.
        if (isSame(srcStr, reverse) == false &&
            roundtripExclusionsSet.contains(c) == false
            && roundtripExclusionsSet.contains(srcStr)==false) {
            logRoundTripFailure(srcStr,targetToSource->getID(), targ,sourceToTarget->getID(), reverse);
            failRound.add(c);
            continue;
        }

        // The decomposed intermediate must convert back to the same result.
        UnicodeString targ2;
        Normalizer::decompose(targ, false, 0, targ2, status);
        if (U_FAILURE(status)) {
            parent->errln("FAIL: Internal error during decomposition%s\n", u_errorName(status));
            return;
        }
        UnicodeString reverse2 = targ2;
        sourceToTarget->transliterate(reverse2);
        if (reverse != reverse2) {
            logNotCanonical("Target-Source", targ, reverse, targ2, reverse2);
        }
    }

    parent->logln("Checking that target characters convert to source and back - Doubles");
    // Counts outer-loop iterations for the pairLimit cutoff.
    int32_t count = 0;

    UnicodeSet targetRangeMinusFailures(targetRange);
    targetRangeMinusFailures.removeAll(failTargSource);
    targetRangeMinusFailures.removeAll(failRound);

    usi.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density, copy));
    // Declared outside the loops so the same objects are reused for every pair.
    UnicodeString targ2;
    UnicodeString reverse2;
    UnicodeString targD;
    for (;;) {
        if (!usi.next() || usi.isString()) break;
        UChar32 c = usi.getCodepoint();
        if (++count > pairLimit) {
            //throw new TestTruncated("Test truncated at " + pairLimit + " x 64k pairs");
            parent->logln("");
            parent->logln((UnicodeString)"Test truncated at " + pairLimit + " x 64k pairs");
            return;
        }

        usi2.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density, copy2));
        for (;;) {
            if (!usi2.next() || usi2.isString())
                break;
            UChar32 d = usi2.getCodepoint();
            srcStr.truncate(0);  // empty the variable without construction/destruction
            srcStr += c;
            srcStr += d;

            targ = srcStr;
            targetToSource->transliterate(targ);
            reverse = targ;
            sourceToTarget->transliterate(reverse);

            if (toSource.containsAll(targ) == false ||
                badCharacters.containsSome(targ) == true)
            {
                targD.truncate(0);  // empty the variable without construction/destruction
                Normalizer::decompose(targ, false, 0, targD, status);
                if (U_FAILURE(status)) {
                    parent->errln("FAIL: Internal error during decomposition%s\n",
                        u_errorName(status));
                    return;
                }
                if (toSource.containsAll(targD) == false
                    || badCharacters.containsSome(targD) == true)
                {
                    logWrongScript("Target-Source", srcStr, targ);
                    continue;
                }
            }
            // A pair is exempt if either code point or the whole pair is excluded.
            if (isSame(srcStr, reverse) == false &&
                roundtripExclusionsSet.contains(c) == false&&
                roundtripExclusionsSet.contains(d) == false &&
                roundtripExclusionsSet.contains(srcStr)== false)
            {
                logRoundTripFailure(srcStr,targetToSource->getID(), targ, sourceToTarget->getID(),reverse);
                continue;
            }

            targ2.truncate(0);  // empty the variable without construction/destruction
            Normalizer::decompose(targ, false, 0, targ2, status);
            if (U_FAILURE(status)) {
                parent->errln("FAIL: Internal error during decomposition%s\n", u_errorName(status));
                return;
            }
            reverse2 = targ2;
            sourceToTarget->transliterate(reverse2);
            if (reverse != reverse2) {
                logNotCanonical("Target-Source", targ,reverse, targ2, reverse2);
            }
        }
    }
    parent->logln("");
}
863
logWrongScript(const UnicodeString & label,const UnicodeString & from,const UnicodeString & to)864 void RTTest::logWrongScript(const UnicodeString& label,
865 const UnicodeString& from,
866 const UnicodeString& to) {
867 parent->errln((UnicodeString)"FAIL " +
868 label + ": " +
869 from + "(" + TestUtility::hex(from) + ") => " +
870 to + "(" + TestUtility::hex(to) + ")");
871 ++errorCount;
872 }
873
logNotCanonical(const UnicodeString & label,const UnicodeString & from,const UnicodeString & to,const UnicodeString & fromCan,const UnicodeString & toCan)874 void RTTest::logNotCanonical(const UnicodeString& label,
875 const UnicodeString& from,
876 const UnicodeString& to,
877 const UnicodeString& fromCan,
878 const UnicodeString& toCan) {
879 parent->errln((UnicodeString)"FAIL (can.equiv)" +
880 label + ": " +
881 from + "(" + TestUtility::hex(from) + ") => " +
882 to + "(" + TestUtility::hex(to) + ")" +
883 fromCan + "(" + TestUtility::hex(fromCan) + ") => " +
884 toCan + " (" +
885 TestUtility::hex(toCan) + ")"
886 );
887 ++errorCount;
888 }
889
logFails(const UnicodeString & label)890 void RTTest::logFails(const UnicodeString& label) {
891 parent->errln((UnicodeString)"<br>FAIL " + label);
892 ++errorCount;
893 }
894
logToRulesFails(const UnicodeString & label,const UnicodeString & from,const UnicodeString & to,const UnicodeString & otherTo)895 void RTTest::logToRulesFails(const UnicodeString& label,
896 const UnicodeString& from,
897 const UnicodeString& to,
898 const UnicodeString& otherTo)
899 {
900 parent->errln((UnicodeString)"FAIL: " +
901 label + ": " +
902 from + "(" + TestUtility::hex(from) + ") => " +
903 to + "(" + TestUtility::hex(to) + ")" +
904 "!=" +
905 otherTo + " (" +
906 TestUtility::hex(otherTo) + ")"
907 );
908 ++errorCount;
909 }
910
911
logRoundTripFailure(const UnicodeString & from,const UnicodeString & toID,const UnicodeString & to,const UnicodeString & backID,const UnicodeString & back)912 void RTTest::logRoundTripFailure(const UnicodeString& from,
913 const UnicodeString& toID,
914 const UnicodeString& to,
915 const UnicodeString& backID,
916 const UnicodeString& back) {
917 if (legalSource->is(from) == false) return; // skip illegals
918
919 parent->errln((UnicodeString)"FAIL Roundtrip: " +
920 from + "(" + TestUtility::hex(from) + ") => " +
921 to + "(" + TestUtility::hex(to) + ") "+toID+" => " +
922 back + "(" + TestUtility::hex(back) + ") "+backID+" => ");
923 ++errorCount;
924 }
925
926 //--------------------------------------------------------------------
927 // Specific Tests
928 //--------------------------------------------------------------------
929
930 /*
931 Note: Unicode 3.2 added new Hiragana/Katakana characters:
932
933 3095..3096 ; 3.2 # [2] HIRAGANA LETTER SMALL KA..HIRAGANA LETTER SMALL KE
934 309F..30A0 ; 3.2 # [2] HIRAGANA DIGRAPH YORI..KATAKANA-HIRAGANA DOUBLE HYPHEN
935 30FF ; 3.2 # KATAKANA DIGRAPH KOTO
936 31F0..31FF ; 3.2 # [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
937
938 Unicode 5.2 added another Hiragana character:
939 1F200 ; 5.2 # SQUARE HIRAGANA HOKA
940
941 We will not add them to the rules until they are more supported (e.g. in fonts on Windows)
942 A bug has been filed to remind us to do this: #1979.
943 */
944
// UnicodeSet patterns shared by the kana tests below.

// Katakana: the [:katakana:] property plus U+30A1..U+30FA and U+30FC, minus
// U+30FF and U+31F0..U+31FF, minus [:^age=5.2:].
static const char KATAKANA[] = "[[[:katakana:][\\u30A1-\\u30FA\\u30FC]]-[\\u30FF\\u31F0-\\u31FF]-[:^age=5.2:]]";
// Hiragana: the [:hiragana:] property plus U+3040..U+3094, minus U+3095..U+3096,
// U+309F..U+30A0 and U+1F200..U+1F2FF, minus [:^age=5.2:].
static const char HIRAGANA[] = "[[[:hiragana:][\\u3040-\\u3094]]-[\\u3095-\\u3096\\u309F-\\u30A0\\U0001F200-\\U0001F2FF]-[:^age=5.2:]]";
// U+30FC on its own.
static const char LENGTH[] = "[\\u30FC]";
// U+FF65..U+FF9D.
static const char HALFWIDTH_KATAKANA[] = "[\\uFF65-\\uFF9D]";
// U+30FD and U+30FE.
static const char KATAKANA_ITERATION[] = "[\\u30FD\\u30FE]";
// U+309D and U+309E.
static const char HIRAGANA_ITERATION[] = "[\\u309D\\u309E]";
// Size of the char buffers used to assemble exclusion patterns in the tests.
static const int32_t TEMP_MAX=256;
952
TestKana()953 void TransliteratorRoundTripTest::TestKana() {
954 RTTest test("Katakana-Hiragana");
955 Legal *legal = new Legal();
956 char temp[TEMP_MAX];
957 strcpy(temp, "[");
958 strcat(temp, HALFWIDTH_KATAKANA);
959 strcat(temp, LENGTH);
960 strcat(temp, "]");
961 test.test(KATAKANA, UnicodeString("[") + HIRAGANA + LENGTH + UnicodeString("]"),
962 temp,
963 this, quick, legal);
964 delete legal;
965 }
966
TestHiragana()967 void TransliteratorRoundTripTest::TestHiragana() {
968 RTTest test("Latin-Hiragana");
969 Legal *legal = new Legal();
970 test.test(UnicodeString("[a-zA-Z]", ""),
971 UnicodeString(HIRAGANA, -1, US_INV),
972 HIRAGANA_ITERATION, this, quick, legal);
973 delete legal;
974 }
975
TestKatakana()976 void TransliteratorRoundTripTest::TestKatakana() {
977 RTTest test("Latin-Katakana");
978 Legal *legal = new Legal();
979 char temp[TEMP_MAX];
980 strcpy(temp, "[");
981 strcat(temp, KATAKANA_ITERATION);
982 strcat(temp, HALFWIDTH_KATAKANA);
983 strcat(temp, "]");
984 test.test(UnicodeString("[a-zA-Z]", ""),
985 UnicodeString(KATAKANA, -1, US_INV),
986 temp,
987 this, quick, legal);
988 delete legal;
989 }
990
TestJamo()991 void TransliteratorRoundTripTest::TestJamo() {
992 RTTest t("Latin-Jamo");
993 Legal *legal = new LegalJamo();
994 t.test(UnicodeString("[a-zA-Z]", ""),
995 UnicodeString("[\\u1100-\\u1112 \\u1161-\\u1175 \\u11A8-\\u11C2]",
996 ""),
997 NULL, this, quick, legal);
998 delete legal;
999 }
1000
TestHangul()1001 void TransliteratorRoundTripTest::TestHangul() {
1002 RTTest t("Latin-Hangul");
1003 Legal *legal = new Legal();
1004 if (quick) t.setPairLimit(1000);
1005 t.test(UnicodeString("[a-zA-Z]", ""),
1006 UnicodeString("[\\uAC00-\\uD7A4]", ""),
1007 NULL, this, quick, legal, 1);
1008 delete legal;
1009 }
1010
1011
// If `status` is a failure code, reports it through errcheckln() together with
// the current file and line, then returns from the enclosing function.
// The bare `return;` means it can only be used inside functions returning void.
// Note that objects owned through raw pointers in the caller are not released
// by this early return.
#define ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errcheckln(status, "error at file %s, line %d, status = %s", __FILE__, __LINE__, \
            u_errorName(status)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
1019
1020
writeStringInU8(FILE * out,const UnicodeString & s)1021 static void writeStringInU8(FILE *out, const UnicodeString &s) {
1022 int i;
1023 for (i=0; i<s.length(); i=s.moveIndex32(i, 1)) {
1024 UChar32 c = s.char32At(i);
1025 uint8_t bufForOneChar[10];
1026 UBool isError = false;
1027 int32_t destIdx = 0;
1028 U8_APPEND(bufForOneChar, destIdx, (int32_t)sizeof(bufForOneChar), c, isError);
1029 U_ASSERT(!isError);
1030 (void)isError;
1031 fwrite(bufForOneChar, 1, destIdx, out);
1032 }
1033 }
1034
1035
1036
1037
TestHan()1038 void TransliteratorRoundTripTest::TestHan() {
1039 UErrorCode status = U_ZERO_ERROR;
1040 LocalULocaleDataPointer uld(ulocdata_open("zh",&status));
1041 LocalUSetPointer USetExemplars(uset_openEmpty());
1042 assertTrue("", USetExemplars.isValid(), false, false, __FILE__, __LINE__);
1043 if (! USetExemplars.isValid()) return;
1044 ulocdata_getExemplarSet(uld.getAlias(), USetExemplars.getAlias(), 0, ULOCDATA_ES_STANDARD, &status);
1045 ASSERT_SUCCESS(status);
1046
1047 UnicodeString source;
1048 UChar32 c;
1049 int i;
1050 for (i=0; ;i++) {
1051 // Add all of the Chinese exemplar chars to the string "source".
1052 c = uset_charAt(USetExemplars.getAlias(), i);
1053 if (c == (UChar32)-1) {
1054 break;
1055 }
1056 source.append(c);
1057 }
1058
1059 // transform with Han translit
1060 Transliterator *hanTL = Transliterator::createInstance("Han-Latin", UTRANS_FORWARD, status);
1061 ASSERT_SUCCESS(status);
1062 UnicodeString target=source;
1063 hanTL->transliterate(target);
1064 // now verify that there are no Han characters left
1065 UnicodeSet allHan("[:han:]", status);
1066 ASSERT_SUCCESS(status);
1067 if (allHan.containsSome(target)) {
1068 errln("file %s, line %d, No Han must be left after Han-Latin transliteration",
1069 __FILE__, __LINE__);
1070 }
1071
1072 // check the pinyin translit
1073 Transliterator *pn = Transliterator::createInstance("Latin-NumericPinyin", UTRANS_FORWARD, status);
1074 ASSERT_SUCCESS(status);
1075 UnicodeString target2 = target;
1076 pn->transliterate(target2);
1077
1078 // verify that there are no marks
1079 Transliterator *nfd = Transliterator::createInstance("nfd", UTRANS_FORWARD, status);
1080 ASSERT_SUCCESS(status);
1081
1082 UnicodeString nfded = target2;
1083 nfd->transliterate(nfded);
1084 UnicodeSet allMarks(UNICODE_STRING_SIMPLE("[\\u0304\\u0301\\u030C\\u0300\\u0306]"), status); // look only for Pinyin tone marks, not all marks (there are some others in there)
1085 ASSERT_SUCCESS(status);
1086 assertFalse("NumericPinyin must contain no marks", allMarks.containsSome(nfded));
1087
1088 // verify roundtrip
1089 Transliterator *np = pn->createInverse(status);
1090 ASSERT_SUCCESS(status);
1091 UnicodeString target3 = target2;
1092 np->transliterate(target3);
1093 UBool roundtripOK = (target3.compare(target) == 0);
1094 assertTrue("NumericPinyin must roundtrip", roundtripOK);
1095 if (!roundtripOK) {
1096 const char *filename = "numeric-pinyin.log.txt";
1097 FILE *out = fopen(filename, "w");
1098 errln("Creating log file %s\n", filename);
1099 fprintf(out, "Pinyin: ");
1100 writeStringInU8(out, target);
1101 fprintf(out, "\nPinyin-Numeric-Pinyin: ");
1102 writeStringInU8(out, target2);
1103 fprintf(out, "\nNumeric-Pinyin-Pinyin: ");
1104 writeStringInU8(out, target3);
1105 fprintf(out, "\n");
1106 fclose(out);
1107 }
1108
1109 delete hanTL;
1110 delete pn;
1111 delete nfd;
1112 delete np;
1113 }
1114
1115
TestGreek()1116 void TransliteratorRoundTripTest::TestGreek() {
1117 logKnownIssue( "cldrbug:1911");
1118 // It is left in its current state as a regression test.
1119
1120 RTTest test("Latin-Greek");
1121 LegalGreek *legal = new LegalGreek(true);
1122
1123 test.test(UnicodeString("[a-zA-Z]", ""),
1124 UnicodeString("[\\u003B\\u00B7[[:Greek:]&[:Letter:]]-["
1125 "\\u1D26-\\u1D2A" // L& [5] GREEK LETTER SMALL CAPITAL GAMMA..GREEK LETTER SMALL CAPITAL PSI
1126 "\\u1D5D-\\u1D61" // Lm [5] MODIFIER LETTER SMALL BETA..MODIFIER LETTER SMALL CHI
1127 "\\u1D66-\\u1D6A" // L& [5] GREEK SUBSCRIPT SMALL LETTER BETA..GREEK SUBSCRIPT SMALL LETTER CHI
1128 "\\u03D7-\\u03EF" // \N{GREEK KAI SYMBOL}..\N{COPTIC SMALL LETTER DEI}
1129 "] & [:Age=4.0:]]",
1130
1131 //UnicodeString("[[\\u003B\\u00B7[:Greek:]-[\\u0374\\u0385\\u1fcd\\u1fce\\u1fdd\\u1fde\\u1fed-\\u1fef\\u1ffd\\u03D7-\\u03EF]]&[:Age=3.2:]]",
1132 ""),
1133 "[\\u00B5\\u037A\\u03D0-\\u03F5\\u03f9]", /* exclusions */
1134 this, quick, legal, 50);
1135
1136
1137 delete legal;
1138 }
1139
1140
TestGreekUNGEGN()1141 void TransliteratorRoundTripTest::TestGreekUNGEGN() {
1142 logKnownIssue( "cldrbug:1911");
1143 // It is left in its current state as a regression test.
1144
1145 RTTest test("Latin-Greek/UNGEGN");
1146 LegalGreek *legal = new LegalGreek(false);
1147
1148 test.test(UnicodeString("[a-zA-Z]", ""),
1149 UnicodeString("[\\u003B\\u00B7[[:Greek:]&[:Letter:]]-["
1150 "\\u1D26-\\u1D2A" // L& [5] GREEK LETTER SMALL CAPITAL GAMMA..GREEK LETTER SMALL CAPITAL PSI
1151 "\\u1D5D-\\u1D61" // Lm [5] MODIFIER LETTER SMALL BETA..MODIFIER LETTER SMALL CHI
1152 "\\u1D66-\\u1D6A" // L& [5] GREEK SUBSCRIPT SMALL LETTER BETA..GREEK SUBSCRIPT SMALL LETTER CHI
1153 "\\u03D7-\\u03EF" // \N{GREEK KAI SYMBOL}..\N{COPTIC SMALL LETTER DEI}
1154 "] & [:Age=4.0:]]",
1155 //UnicodeString("[[\\u003B\\u00B7[:Greek:]-[\\u0374\\u0385\\u1fce\\u1fde\\u03D7-\\u03EF]]&[:Age=3.2:]]",
1156 ""),
1157 "[\\u0385\\u00B5\\u037A\\u03D0-\\uFFFF {\\u039C\\u03C0}]", /* roundtrip exclusions */
1158 this, quick, legal);
1159
1160 delete legal;
1161 }
1162
Testel()1163 void TransliteratorRoundTripTest::Testel() {
1164 logKnownIssue( "cldrbug:1911");
1165 // It is left in its current state as a regression test.
1166
1167 RTTest test("Latin-el");
1168 LegalGreek *legal = new LegalGreek(false);
1169
1170 test.test(UnicodeString("[a-zA-Z]", ""),
1171 UnicodeString("[\\u003B\\u00B7[[:Greek:]&[:Letter:]]-["
1172 "\\u1D26-\\u1D2A" // L& [5] GREEK LETTER SMALL CAPITAL GAMMA..GREEK LETTER SMALL CAPITAL PSI
1173 "\\u1D5D-\\u1D61" // Lm [5] MODIFIER LETTER SMALL BETA..MODIFIER LETTER SMALL CHI
1174 "\\u1D66-\\u1D6A" // L& [5] GREEK SUBSCRIPT SMALL LETTER BETA..GREEK SUBSCRIPT SMALL LETTER CHI
1175 "\\u03D7-\\u03EF" // \N{GREEK KAI SYMBOL}..\N{COPTIC SMALL LETTER DEI}
1176 "] & [:Age=4.0:]]",
1177 //UnicodeString("[[\\u003B\\u00B7[:Greek:]-[\\u0374\\u0385\\u1fce\\u1fde\\u03D7-\\u03EF]]&[:Age=3.2:]]",
1178 ""),
1179 "[\\u00B5\\u037A\\u03D0-\\uFFFF {\\u039C\\u03C0}]", /* exclusions */
1180 this, quick, legal);
1181
1182
1183 delete legal;
1184 }
1185
1186
TestArabic()1187 void TransliteratorRoundTripTest::TestArabic() {
1188 UnicodeString ARABIC("[\\u060C\\u061B\\u061F\\u0621\\u0627-\\u063A\\u0641-\\u0655\\u0660-\\u066C\\u067E\\u0686\\u0698\\u06A4\\u06AD\\u06AF\\u06CB-\\u06CC\\u06F0-\\u06F9]", -1, US_INV);
1189 Legal *legal = new Legal();
1190 RTTest test("Latin-Arabic");
1191 test.test(UNICODE_STRING_SIMPLE("[a-zA-Z\\u02BE\\u02BF\\u207F]"), ARABIC, "[a-zA-Z\\u02BE\\u02BF\\u207F]",this, quick, legal); //
1192 delete legal;
1193 }
1194 class LegalHebrew : public Legal {
1195 private:
1196 UnicodeSet FINAL;
1197 UnicodeSet NON_FINAL;
1198 UnicodeSet LETTER;
1199 public:
1200 LegalHebrew(UErrorCode& error);
~LegalHebrew()1201 virtual ~LegalHebrew() {}
1202 virtual UBool is(const UnicodeString& sourceString) const override;
1203 };
1204
LegalHebrew(UErrorCode & error)1205 LegalHebrew::LegalHebrew(UErrorCode& error){
1206 FINAL.applyPattern(UNICODE_STRING_SIMPLE("[\\u05DA\\u05DD\\u05DF\\u05E3\\u05E5]"), error);
1207 NON_FINAL.applyPattern(UNICODE_STRING_SIMPLE("[\\u05DB\\u05DE\\u05E0\\u05E4\\u05E6]"), error);
1208 LETTER.applyPattern("[:letter:]", error);
1209 }
is(const UnicodeString & sourceString) const1210 UBool LegalHebrew::is(const UnicodeString& sourceString)const{
1211
1212 if (sourceString.length() == 0) return true;
1213 // don't worry about surrogates.
1214 for (int i = 0; i < sourceString.length(); ++i) {
1215 UChar ch = sourceString.charAt(i);
1216 UChar next = i+1 == sourceString.length() ? 0x0000 : sourceString.charAt(i);
1217 if (FINAL.contains(ch)) {
1218 if (LETTER.contains(next)) return false;
1219 } else if (NON_FINAL.contains(ch)) {
1220 if (!LETTER.contains(next)) return false;
1221 }
1222 }
1223 return true;
1224 }
TestHebrew()1225 void TransliteratorRoundTripTest::TestHebrew() {
1226 logKnownIssue( "cldrbug:1911");
1227 // It is left in its current state as a regression test.
1228
1229 //long start = System.currentTimeMillis();
1230 UErrorCode error = U_ZERO_ERROR;
1231 LegalHebrew* legal = new LegalHebrew(error);
1232 if(U_FAILURE(error)){
1233 dataerrln("Could not construct LegalHebrew object. Error: %s", u_errorName(error));
1234 return;
1235 }
1236 RTTest test("Latin-Hebrew");
1237 test.test(UNICODE_STRING_SIMPLE("[a-zA-Z\\u02BC\\u02BB]"), UNICODE_STRING_SIMPLE("[[[:hebrew:]-[\\u05BD\\uFB00-\\uFBFF]]&[:Age=4.0:]]"), "[\\u05F0\\u05F1\\u05F2]", this, quick, legal);
1238
1239 //showElapsed(start, "TestHebrew");
1240 delete legal;
1241 }
TestCyrillic()1242 void TransliteratorRoundTripTest::TestCyrillic() {
1243 RTTest test("Latin-Cyrillic");
1244 Legal *legal = new Legal();
1245
1246 test.test(UnicodeString("[a-zA-Z\\u0110\\u0111\\u02BA\\u02B9]", ""),
1247 UnicodeString("[[\\u0400-\\u045F] & [:Age=3.2:]]", ""), NULL, this, quick,
1248 legal);
1249
1250 delete legal;
1251 }
1252
1253
1254 // Inter-Indic Tests ----------------------------------
1255 class LegalIndic :public Legal{
1256 UnicodeSet vowelSignSet;
1257 UnicodeSet avagraha;
1258 UnicodeSet nukta;
1259 UnicodeSet virama;
1260 UnicodeSet sanskritStressSigns;
1261 UnicodeSet chandrabindu;
1262
1263 public:
1264 LegalIndic();
1265 virtual UBool is(const UnicodeString& sourceString) const override;
~LegalIndic()1266 virtual ~LegalIndic() {}
1267 };
is(const UnicodeString & sourceString) const1268 UBool LegalIndic::is(const UnicodeString& sourceString) const{
1269 int cp=sourceString.charAt(0);
1270
1271 // A vowel sign cannot be the first char
1272 if(vowelSignSet.contains(cp)){
1273 return false;
1274 }else if(avagraha.contains(cp)){
1275 return false;
1276 }else if(virama.contains(cp)){
1277 return false;
1278 }else if(nukta.contains(cp)){
1279 return false;
1280 }else if(sanskritStressSigns.contains(cp)){
1281 return false;
1282 }else if(chandrabindu.contains(cp) &&
1283 ((sourceString.length()>1) &&
1284 vowelSignSet.contains(sourceString.charAt(1)))){
1285 return false;
1286 }
1287 return true;
1288 }
LegalIndic()1289 LegalIndic::LegalIndic(){
1290 UErrorCode status = U_ZERO_ERROR;
1291 vowelSignSet.addAll( UnicodeSet("[\\u0902\\u0903\\u0904\\u093e-\\u094c\\u0962\\u0963]",status));/* Devanagari */
1292 vowelSignSet.addAll( UnicodeSet("[\\u0982\\u0983\\u09be-\\u09cc\\u09e2\\u09e3\\u09D7]",status));/* Bengali */
1293 vowelSignSet.addAll( UnicodeSet("[\\u0a02\\u0a03\\u0a3e-\\u0a4c\\u0a62\\u0a63\\u0a70\\u0a71]",status));/* Gurmukhi */
1294 vowelSignSet.addAll( UnicodeSet("[\\u0a82\\u0a83\\u0abe-\\u0acc\\u0ae2\\u0ae3]",status));/* Gujarati */
1295 vowelSignSet.addAll( UnicodeSet("[\\u0b02\\u0b03\\u0b3e-\\u0b4c\\u0b62\\u0b63\\u0b56\\u0b57]",status));/* Oriya */
1296 vowelSignSet.addAll( UnicodeSet("[\\u0b82\\u0b83\\u0bbe-\\u0bcc\\u0be2\\u0be3\\u0bd7]",status));/* Tamil */
1297 vowelSignSet.addAll( UnicodeSet("[\\u0c02\\u0c03\\u0c3e-\\u0c4c\\u0c62\\u0c63\\u0c55\\u0c56]",status));/* Telugu */
1298 vowelSignSet.addAll( UnicodeSet("[\\u0c82\\u0c83\\u0cbe-\\u0ccc\\u0ce2\\u0ce3\\u0cd5\\u0cd6]",status));/* Kannada */
1299 vowelSignSet.addAll( UnicodeSet("[\\u0d02\\u0d03\\u0d3e-\\u0d4c\\u0d62\\u0d63\\u0d57]",status));/* Malayalam */
1300
1301 avagraha.addAll(UnicodeSet("[\\u093d\\u09bd\\u0abd\\u0b3d\\u0cbd]",status));
1302 nukta.addAll(UnicodeSet("[\\u093c\\u09bc\\u0a3c\\u0abc\\u0b3c\\u0cbc]",status));
1303 virama.addAll(UnicodeSet("[\\u094d\\u09cd\\u0a4d\\u0acd\\u0b4d\\u0bcd\\u0c4d\\u0ccd\\u0d4d]",status));
1304 sanskritStressSigns.addAll(UnicodeSet("[\\u0951\\u0952\\u0953\\u0954\\u097d]",status));
1305 chandrabindu.addAll(UnicodeSet("[\\u0901\\u0981\\u0A81\\u0b01\\u0c01]",status));
1306
1307 }
1308
// UnicodeSet pattern used as the Latin-side set for the Latin-<Indic script> tests:
// it is the source set of TestDevanagariLatin and of the "Latin-..." rows of
// interIndicArray. Structurally it is an explicit list of characters and ranges,
// minus [U+E000..U+E080 U+01E2 U+01E3], intersected with [[:latin:][:mark:]].
static const char latinForIndic[] = "[['.0-9A-Za-z~\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"
                                   "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD\\u00FF-\\u010F"
                                   "\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137\\u0139-\\u013E\\u0143-\\u0148"
                                   "\\u014C-\\u0151\\u0154-\\u0165\\u0168-\\u017E\\u01A0-\\u01A1\\u01AF-\\u01B0"
                                   "\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01ED\\u01F0\\u01F4-\\u01F5\\u01F8-\\u01FB"
                                   "\\u0200-\\u021B\\u021E-\\u021F\\u0226-\\u0233\\u0294\\u0303-\\u0304\\u0306\\u0314-\\u0315"
                                   "\\u0325\\u040E\\u0419\\u0439\\u045E\\u04C1-\\u04C2\\u04D0-\\u04D1\\u04D6-\\u04D7"
                                   "\\u04E2-\\u04E3\\u04EE-\\u04EF\\u1E00-\\u1E99\\u1EA0-\\u1EF9\\u1F01\\u1F03\\u1F05"
                                   "\\u1F07\\u1F09\\u1F0B\\u1F0D\\u1F0F\\u1F11\\u1F13\\u1F15\\u1F19\\u1F1B\\u1F1D\\u1F21"
                                   "\\u1F23\\u1F25\\u1F27\\u1F29\\u1F2B\\u1F2D\\u1F2F\\u1F31\\u1F33\\u1F35\\u1F37\\u1F39"
                                   "\\u1F3B\\u1F3D\\u1F3F\\u1F41\\u1F43\\u1F45\\u1F49\\u1F4B\\u1F4D\\u1F51\\u1F53\\u1F55"
                                   "\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F\\u1F61\\u1F63\\u1F65\\u1F67\\u1F69\\u1F6B\\u1F6D"
                                   "\\u1F6F\\u1F81\\u1F83\\u1F85\\u1F87\\u1F89\\u1F8B\\u1F8D\\u1F8F\\u1F91\\u1F93\\u1F95"
                                   "\\u1F97\\u1F99\\u1F9B\\u1F9D\\u1F9F\\u1FA1\\u1FA3\\u1FA5\\u1FA7\\u1FA9\\u1FAB\\u1FAD"
                                   "\\u1FAF-\\u1FB1\\u1FB8-\\u1FB9\\u1FD0-\\u1FD1\\u1FD8-\\u1FD9\\u1FE0-\\u1FE1\\u1FE5"
                                   "\\u1FE8-\\u1FE9\\u1FEC\\u212A-\\u212B\\uE04D\\uE064]"
                                   "-[\\uE000-\\uE080 \\u01E2\\u01E3]& [[:latin:][:mark:]]]";
1326
TestDevanagariLatin()1327 void TransliteratorRoundTripTest::TestDevanagariLatin() {
1328 {
1329 UErrorCode status = U_ZERO_ERROR;
1330 UParseError parseError;
1331 TransliteratorPointer t1(Transliterator::createInstance("[\\u0964-\\u0965\\u0981-\\u0983\\u0985-\\u098C\\u098F-\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC\\u09BE-\\u09C4\\u09C7-\\u09C8\\u09CB-\\u09CD\\u09D7\\u09DC-\\u09DD\\u09DF-\\u09E3\\u09E6-\\u09FA];NFD;Bengali-InterIndic;InterIndic-Gujarati;NFC;",UTRANS_FORWARD, parseError, status));
1332 if((Transliterator *)t1 != NULL){
1333 TransliteratorPointer t2(t1->createInverse(status));
1334 if(U_FAILURE(status)){
1335 errln("FAIL: could not create the Inverse:-( \n");
1336 }
1337 }else {
1338 dataerrln("FAIL: could not create the transliterator. Error: %s\n", u_errorName(status));
1339 }
1340
1341 }
1342 RTTest test("Latin-Devanagari");
1343 Legal *legal = new LegalIndic();
1344 logKnownIssue( "cldrbug:1911");
1345 // It is left in its current state as a regression test.
1346
1347 test.test(UnicodeString(latinForIndic, ""),
1348 UnicodeString("[[[:Devanagari:][\\u094d][\\u0964\\u0965]]&[:Age=4.1:]-[\\u0970]]", ""), "[\\u0965\\u0904]", this, quick,
1349 legal, 50);
1350
1351 delete legal;
1352 }
1353
/*
 * Table driving TestInterIndic. It is a flat array read in groups of
 * INTER_INDIC_ARRAY_WIDTH (4) consecutive strings:
 *   [0] transliterator ID (also used as the RTTest name)
 *   [1] source UnicodeSet pattern
 *   [2] target UnicodeSet pattern
 *   [3] roundtrip exclusions pattern, or NULL
 * Defined this way (flat array plus a separate width constant) for HP/UX11CC :-(
 */
static const int32_t INTER_INDIC_ARRAY_WIDTH = 4;
static const char * const interIndicArray[] = {

    "BENGALI-DEVANAGARI", "[:BENGALI:]", "[[:Devanagari:]-[\\u0970]]",
    "[\\u0904\\u0951-\\u0954\\u0943-\\u0949\\u094a\\u0962\\u0963\\u090D\\u090e\\u0911\\u0912\\u0929\\u0933\\u0934\\u0935\\u093d\\u0950\\u0958\\u0959\\u095a\\u095b\\u095e\\u097d]", /*roundtrip exclusions*/

    "DEVANAGARI-BENGALI", "[[:Devanagari:]-[\\u0970]]", "[:BENGALI:]",
    "[\\u0951-\\u0954\\u0951-\\u0954\\u09D7\\u090D\\u090e\\u0911\\u0912\\u0929\\u0933\\u0934\\u0935\\u093d\\u0950\\u0958\\u0959\\u095a\\u095b\\u095e\\u09f0\\u09f1\\u09f2-\\u09fa\\u09ce]", /*roundtrip exclusions*/

    "GURMUKHI-DEVANAGARI", "[:GURMUKHI:]", "[[:Devanagari:]-[\\u0970]]",
    "[\\u0904\\u0901\\u0902\\u0936\\u0933\\u0951-\\u0954\\u0902\\u0903\\u0943-\\u0949\\u094a\\u0962\\u0963\\u090B\\u090C\\u090D\\u090e\\u0911\\u0912\\u0934\\u0937\\u093D\\u0950\\u0960\\u0961\\u097d]", /*roundtrip exclusions*/

    "DEVANAGARI-GURMUKHI", "[[:Devanagari:]-[\\u0970]]", "[:GURMUKHI:]",
    "[\\u0904\\u0A02\\u0946\\u0A5C\\u0951-\\u0954\\u0A70\\u0A71\\u090B\\u090C\\u090D\\u090e\\u0911\\u0912\\u0934\\u0937\\u093D\\u0950\\u0960\\u0961\\u0a72\\u0a73\\u0a74]", /*roundtrip exclusions*/

    "GUJARATI-DEVANAGARI", "[:GUJARATI:]", "[[:Devanagari:]-[\\u0970]]",
    "[\\u0946\\u094A\\u0962\\u0963\\u0951-\\u0954\\u0961\\u090c\\u090e\\u0912\\u097d]", /*roundtrip exclusions*/

    "DEVANAGARI-GUJARATI", "[[:Devanagari:]-[\\u0970]]", "[:GUJARATI:]",
    "[\\u0951-\\u0954\\u0961\\u090c\\u090e\\u0912]", /*roundtrip exclusions*/

    "ORIYA-DEVANAGARI", "[:ORIYA:]", "[[:Devanagari:]-[\\u0970]]",
    "[\\u0904\\u0943-\\u094a\\u0962\\u0963\\u0951-\\u0954\\u0950\\u090D\\u090e\\u0912\\u0911\\u0931\\u0935\\u097d]", /*roundtrip exclusions*/

    "DEVANAGARI-ORIYA", "[[:Devanagari:]-[\\u0970]]", "[:ORIYA:]",
    "[\\u0b5f\\u0b56\\u0b57\\u0b70\\u0b71\\u0950\\u090D\\u090e\\u0912\\u0911\\u0931]", /*roundtrip exclusions*/

    "Tamil-DEVANAGARI", "[:tamil:]", "[[:Devanagari:]-[\\u0970]]",
    "[\\u0901\\u0904\\u093c\\u0943-\\u094a\\u0951-\\u0954\\u0962\\u0963\\u090B\\u090C\\u090D\\u0911\\u0916\\u0917\\u0918\\u091B\\u091D\\u0920\\u0921\\u0922\\u0925\\u0926\\u0927\\u092B\\u092C\\u092D\\u0936\\u093d\\u0950[\\u0958-\\u0961]\\u097d]", /*roundtrip exclusions*/

    "DEVANAGARI-Tamil", "[[:Devanagari:]-[\\u0970]]", "[:tamil:]",
    "[\\u0bd7\\u0BF0\\u0BF1\\u0BF2]", /*roundtrip exclusions*/

    "Telugu-DEVANAGARI", "[:telugu:]", "[[:Devanagari:]-[\\u0970]]",
    "[\\u0904\\u093c\\u0950\\u0945\\u0949\\u0951-\\u0954\\u0962\\u0963\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]\\u097d]", /*roundtrip exclusions*/

    "DEVANAGARI-TELUGU", "[[:Devanagari:]-[\\u0970]]", "[:TELUGU:]",
    "[\\u0c55\\u0c56\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/

    "KANNADA-DEVANAGARI", "[:KANNADA:]", "[[:Devanagari:]-[\\u0970]]",
    "[\\u0901\\u0904\\u0946\\u093c\\u0950\\u0945\\u0949\\u0951-\\u0954\\u0962\\u0963\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]\\u097d]", /*roundtrip exclusions*/

    "DEVANAGARI-KANNADA", "[[:Devanagari:]-[\\u0970]]", "[:KANNADA:]",
    "[{\\u0cb0\\u0cbc}{\\u0cb3\\u0cbc}\\u0cde\\u0cd5\\u0cd6\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/

    "MALAYALAM-DEVANAGARI", "[:MALAYALAM:]", "[[:Devanagari:]-[\\u0970]]",
    "[\\u0901\\u0904\\u094a\\u094b\\u094c\\u093c\\u0950\\u0944\\u0945\\u0949\\u0951-\\u0954\\u0962\\u0963\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]\\u097d]", /*roundtrip exclusions*/

    "DEVANAGARI-MALAYALAM", "[[:Devanagari:]-[\\u0970]]", "[:MALAYALAM:]",
    "[\\u0d4c\\u0d57\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/

    "GURMUKHI-BENGALI", "[:GURMUKHI:]", "[:BENGALI:]",
    "[\\u0981\\u0982\\u09b6\\u09e2\\u09e3\\u09c3\\u09c4\\u09d7\\u098B\\u098C\\u09B7\\u09E0\\u09E1\\u09F0\\u09F1\\u09f2-\\u09fa\\u09ce]", /*roundtrip exclusions*/

    "BENGALI-GURMUKHI", "[:BENGALI:]", "[:GURMUKHI:]",
    "[\\u0A02\\u0a5c\\u0a47\\u0a70\\u0a71\\u0A33\\u0A35\\u0A59\\u0A5A\\u0A5B\\u0A5E\\u0A72\\u0A73\\u0A74]", /*roundtrip exclusions*/

    "GUJARATI-BENGALI", "[:GUJARATI:]", "[:BENGALI:]",
    "[\\u09d7\\u09e2\\u09e3\\u098c\\u09e1\\u09f0\\u09f1\\u09f2-\\u09fa\\u09ce]", /*roundtrip exclusions*/

    "BENGALI-GUJARATI", "[:BENGALI:]", "[:GUJARATI:]",
    "[\\u0A82\\u0a83\\u0Ac9\\u0Ac5\\u0ac7\\u0A8D\\u0A91\\u0AB3\\u0AB5\\u0ABD\\u0AD0]", /*roundtrip exclusions*/

    "ORIYA-BENGALI", "[:ORIYA:]", "[:BENGALI:]",
    "[\\u09c4\\u09e2\\u09e3\\u09f0\\u09f1\\u09f2-\\u09fa\\u09ce]", /*roundtrip exclusions*/

    "BENGALI-ORIYA", "[:BENGALI:]", "[:ORIYA:]",
    "[\\u0b35\\u0b71\\u0b5f\\u0b56\\u0b33\\u0b3d]", /*roundtrip exclusions*/

    "Tamil-BENGALI", "[:tamil:]", "[:BENGALI:]",
    "[\\u0981\\u09bc\\u09c3\\u09c4\\u09e2\\u09e3\\u09f0\\u09f1\\u098B\\u098C\\u0996\\u0997\\u0998\\u099B\\u099D\\u09A0\\u09A1\\u09A2\\u09A5\\u09A6\\u09A7\\u09AB\\u09AC\\u09AD\\u09B6\\u09DC\\u09DD\\u09DF\\u09E0\\u09E1\\u09f2-\\u09fa\\u09ce]", /*roundtrip exclusions*/

    "BENGALI-Tamil", "[:BENGALI:]", "[:tamil:]",
    "[\\u0bc6\\u0bc7\\u0bca\\u0B8E\\u0B92\\u0BA9\\u0BB1\\u0BB3\\u0BB4\\u0BB5\\u0BF0\\u0BF1\\u0BF2]", /*roundtrip exclusions*/

    "Telugu-BENGALI", "[:telugu:]", "[:BENGALI:]",
    "[\\u09e2\\u09e3\\u09bc\\u09d7\\u09f0\\u09f1\\u09dc\\u09dd\\u09df\\u09f2-\\u09fa\\u09ce]", /*roundtrip exclusions*/

    "BENGALI-TELUGU", "[:BENGALI:]", "[:TELUGU:]",
    "[\\u0c55\\u0c56\\u0c47\\u0c46\\u0c4a\\u0C0E\\u0C12\\u0C31\\u0C33\\u0C35]", /*roundtrip exclusions*/

    "KANNADA-BENGALI", "[:KANNADA:]", "[:BENGALI:]",
    "[\\u0981\\u09e2\\u09e3\\u09bc\\u09d7\\u09dc\\u09dd\\u09df\\u09f0\\u09f1\\u09f2-\\u09fa\\u09ce]", /*roundtrip exclusions*/

    "BENGALI-KANNADA", "[:BENGALI:]", "[:KANNADA:]",
    "[{\\u0cb0\\u0cbc}{\\u0cb3\\u0cbc}\\u0cc6\\u0cca\\u0cd5\\u0cd6\\u0cc7\\u0C8E\\u0C92\\u0CB1\\u0cb3\\u0cb5\\u0cde]", /*roundtrip exclusions*/

    "MALAYALAM-BENGALI", "[:MALAYALAM:]", "[:BENGALI:]",
    "[\\u0981\\u09e2\\u09e3\\u09bc\\u09c4\\u09f0\\u09f1\\u09dc\\u09dd\\u09df\\u09dc\\u09dd\\u09df\\u09f2-\\u09fa\\u09ce]", /*roundtrip exclusions*/

    "BENGALI-MALAYALAM", "[:BENGALI:]", "[:MALAYALAM:]",
    "[\\u0d46\\u0d4a\\u0d47\\u0d31-\\u0d35\\u0d0e\\u0d12]", /*roundtrip exclusions*/

    "GUJARATI-GURMUKHI", "[:GUJARATI:]", "[:GURMUKHI:]",
    "[\\u0A02\\u0ab3\\u0ab6\\u0A70\\u0a71\\u0a82\\u0a83\\u0ac3\\u0ac4\\u0ac5\\u0ac9\\u0a5c\\u0a72\\u0a73\\u0a74\\u0a8b\\u0a8d\\u0a91\\u0abd]", /*roundtrip exclusions*/

    "GURMUKHI-GUJARATI", "[:GURMUKHI:]", "[:GUJARATI:]",
    "[\\u0a5c\\u0A70\\u0a71\\u0a72\\u0a73\\u0a74\\u0a82\\u0a83\\u0a8b\\u0a8c\\u0a8d\\u0a91\\u0ab3\\u0ab6\\u0ab7\\u0abd\\u0ac3\\u0ac4\\u0ac5\\u0ac9\\u0ad0\\u0ae0\\u0ae1]", /*roundtrip exclusions*/

    "ORIYA-GURMUKHI", "[:ORIYA:]", "[:GURMUKHI:]",
    "[\\u0A01\\u0A02\\u0a5c\\u0a21\\u0a47\\u0a71\\u0b02\\u0b03\\u0b33\\u0b36\\u0b43\\u0b56\\u0b57\\u0B0B\\u0B0C\\u0B37\\u0B3D\\u0B5F\\u0B60\\u0B61\\u0a35\\u0a72\\u0a73\\u0a74]", /*roundtrip exclusions*/

    "GURMUKHI-ORIYA", "[:GURMUKHI:]", "[:ORIYA:]",
    "[\\u0b01\\u0b02\\u0b03\\u0b33\\u0b36\\u0b43\\u0b56\\u0b57\\u0B0B\\u0B0C\\u0B37\\u0B3D\\u0B5F\\u0B60\\u0B61\\u0b70\\u0b71]", /*roundtrip exclusions*/

    "TAMIL-GURMUKHI", "[:TAMIL:]", "[:GURMUKHI:]",
    "[\\u0A01\\u0A02\\u0a33\\u0a36\\u0a3c\\u0a70\\u0a71\\u0a47\\u0A16\\u0A17\\u0A18\\u0A1B\\u0A1D\\u0A20\\u0A21\\u0A22\\u0A25\\u0A26\\u0A27\\u0A2B\\u0A2C\\u0A2D\\u0A59\\u0A5A\\u0A5B\\u0A5C\\u0A5E\\u0A72\\u0A73\\u0A74]", /*roundtrip exclusions*/

    "GURMUKHI-TAMIL", "[:GURMUKHI:]", "[:TAMIL:]",
    "[\\u0b82\\u0bc6\\u0bca\\u0bd7\\u0bb7\\u0bb3\\u0b83\\u0B8E\\u0B92\\u0BA9\\u0BB1\\u0BB4\\u0bb6\\u0BF0\\u0BF1\\u0BF2]", /*roundtrip exclusions*/

    "TELUGU-GURMUKHI", "[:TELUGU:]", "[:GURMUKHI:]",
    "[\\u0A02\\u0a33\\u0a36\\u0a3c\\u0a70\\u0a71\\u0A59\\u0A5A\\u0A5B\\u0A5C\\u0A5E\\u0A72\\u0A73\\u0A74]", /*roundtrip exclusions*/

    "GURMUKHI-TELUGU", "[:GURMUKHI:]", "[:TELUGU:]",
    "[\\u0c01\\u0c02\\u0c03\\u0c33\\u0c36\\u0c44\\u0c43\\u0c46\\u0c4a\\u0c56\\u0c55\\u0C0B\\u0C0C\\u0C0E\\u0C12\\u0C31\\u0C37\\u0C60\\u0C61]", /*roundtrip exclusions*/

    "KANNADA-GURMUKHI", "[:KANNADA:]", "[:GURMUKHI:]",
    "[\\u0A01\\u0A02\\u0a33\\u0a36\\u0a3c\\u0a70\\u0a71\\u0A59\\u0A5A\\u0A5B\\u0A5C\\u0A5E\\u0A72\\u0A73\\u0A74]", /*roundtrip exclusions*/

    "GURMUKHI-KANNADA", "[:GURMUKHI:]", "[:KANNADA:]",
    "[{\\u0cb0\\u0cbc}{\\u0cb3\\u0cbc}\\u0c82\\u0c83\\u0cb3\\u0cb6\\u0cc4\\u0cc3\\u0cc6\\u0cca\\u0cd5\\u0cd6\\u0C8B\\u0C8C\\u0C8E\\u0C92\\u0CB1\\u0CB7\\u0cbd\\u0CE0\\u0CE1\\u0cde]", /*roundtrip exclusions*/

    "MALAYALAM-GURMUKHI", "[:MALAYALAM:]", "[:GURMUKHI:]",
    "[\\u0A01\\u0A02\\u0a4b\\u0a4c\\u0a33\\u0a36\\u0a3c\\u0a70\\u0a71\\u0A59\\u0A5A\\u0A5B\\u0A5C\\u0A5E\\u0A72\\u0A73\\u0A74]", /*roundtrip exclusions*/

    "GURMUKHI-MALAYALAM", "[:GURMUKHI:]", "[:MALAYALAM:]",
    "[\\u0d02\\u0d03\\u0d33\\u0d36\\u0d43\\u0d46\\u0d4a\\u0d4c\\u0d57\\u0D0B\\u0D0C\\u0D0E\\u0D12\\u0D31\\u0D34\\u0D37\\u0D60\\u0D61]", /*roundtrip exclusions*/

    "GUJARATI-ORIYA", "[:GUJARATI:]", "[:ORIYA:]",
    "[\\u0b56\\u0b57\\u0B0C\\u0B5F\\u0B61\\u0b70\\u0b71]", /*roundtrip exclusions*/

    "ORIYA-GUJARATI", "[:ORIYA:]", "[:GUJARATI:]",
    "[\\u0Ac4\\u0Ac5\\u0Ac9\\u0Ac7\\u0A8D\\u0A91\\u0AB5\\u0Ad0]", /*roundtrip exclusions*/

    "TAMIL-GUJARATI", "[:TAMIL:]", "[:GUJARATI:]",
    "[\\u0A81\\u0a8c\\u0abc\\u0ac3\\u0Ac4\\u0Ac5\\u0Ac9\\u0Ac7\\u0A8B\\u0A8D\\u0A91\\u0A96\\u0A97\\u0A98\\u0A9B\\u0A9D\\u0AA0\\u0AA1\\u0AA2\\u0AA5\\u0AA6\\u0AA7\\u0AAB\\u0AAC\\u0AAD\\u0AB6\\u0ABD\\u0AD0\\u0AE0\\u0AE1]", /*roundtrip exclusions*/

    "GUJARATI-TAMIL", "[:GUJARATI:]", "[:TAMIL:]",
    "[\\u0Bc6\\u0Bca\\u0Bd7\\u0B8E\\u0B92\\u0BA9\\u0BB1\\u0BB4\\u0BF0\\u0BF1\\u0BF2]", /*roundtrip exclusions*/

    "TELUGU-GUJARATI", "[:TELUGU:]", "[:GUJARATI:]",
    "[\\u0abc\\u0Ac5\\u0Ac9\\u0A8D\\u0A91\\u0ABD\\u0Ad0]", /*roundtrip exclusions*/

    "GUJARATI-TELUGU", "[:GUJARATI:]", "[:TELUGU:]",
    "[\\u0c46\\u0c4a\\u0c55\\u0c56\\u0C0C\\u0C0E\\u0C12\\u0C31\\u0C61]", /*roundtrip exclusions*/

    "KANNADA-GUJARATI", "[:KANNADA:]", "[:GUJARATI:]",
    "[\\u0A81\\u0abc\\u0Ac5\\u0Ac9\\u0A8D\\u0A91\\u0ABD\\u0Ad0]", /*roundtrip exclusions*/

    "GUJARATI-KANNADA", "[:GUJARATI:]", "[:KANNADA:]",
    "[{\\u0cb0\\u0cbc}{\\u0cb3\\u0cbc}\\u0cc6\\u0cca\\u0cd5\\u0cd6\\u0C8C\\u0C8E\\u0C92\\u0CB1\\u0CDE\\u0CE1]", /*roundtrip exclusions*/

    "MALAYALAM-GUJARATI", "[:MALAYALAM:]", "[:GUJARATI:]",
    "[\\u0A81\\u0ac4\\u0acb\\u0acc\\u0abc\\u0Ac5\\u0Ac9\\u0A8D\\u0A91\\u0ABD\\u0Ad0]", /*roundtrip exclusions*/

    "GUJARATI-MALAYALAM", "[:GUJARATI:]", "[:MALAYALAM:]",
    "[\\u0d46\\u0d4a\\u0d4c\\u0d55\\u0d57\\u0D0C\\u0D0E\\u0D12\\u0D31\\u0D34\\u0D61]", /*roundtrip exclusions*/

    "TAMIL-ORIYA", "[:TAMIL:]", "[:ORIYA:]",
    "[\\u0B01\\u0b3c\\u0b43\\u0b56\\u0B0B\\u0B0C\\u0B16\\u0B17\\u0B18\\u0B1B\\u0B1D\\u0B20\\u0B21\\u0B22\\u0B25\\u0B26\\u0B27\\u0B2B\\u0B2C\\u0B2D\\u0B36\\u0B3D\\u0B5C\\u0B5D\\u0B5F\\u0B60\\u0B61\\u0b70\\u0b71]", /*roundtrip exclusions*/

    "ORIYA-TAMIL", "[:ORIYA:]", "[:TAMIL:]",
    "[\\u0bc6\\u0bca\\u0bc7\\u0B8E\\u0B92\\u0BA9\\u0BB1\\u0BB4\\u0BB5\\u0BF0\\u0BF1\\u0BF2]", /*roundtrip exclusions*/

    "TELUGU-ORIYA", "[:TELUGU:]", "[:ORIYA:]",
    "[\\u0b3c\\u0b57\\u0b56\\u0B3D\\u0B5C\\u0B5D\\u0B5F\\u0b70\\u0b71]", /*roundtrip exclusions*/

    "ORIYA-TELUGU", "[:ORIYA:]", "[:TELUGU:]",
    "[\\u0c44\\u0c46\\u0c4a\\u0c55\\u0c47\\u0C0E\\u0C12\\u0C31\\u0C35]", /*roundtrip exclusions*/

    "KANNADA-ORIYA", "[:KANNADA:]", "[:ORIYA:]",
    "[\\u0B01\\u0b3c\\u0b57\\u0B3D\\u0B5C\\u0B5D\\u0B5F\\u0b70\\u0b71]", /*roundtrip exclusions*/

    "ORIYA-KANNADA", "[:ORIYA:]", "[:KANNADA:]",
    "[{\\u0cb0\\u0cbc}{\\u0cb3\\u0cbc}\\u0cc4\\u0cc6\\u0cca\\u0cd5\\u0cc7\\u0C8E\\u0C92\\u0CB1\\u0CB5\\u0CDE]", /*roundtrip exclusions*/

    "MALAYALAM-ORIYA", "[:MALAYALAM:]", "[:ORIYA:]",
    "[\\u0B01\\u0b3c\\u0b56\\u0B3D\\u0B5C\\u0B5D\\u0B5F\\u0b70\\u0b71]", /*roundtrip exclusions*/

    "ORIYA-MALAYALAM", "[:ORIYA:]", "[:MALAYALAM:]",
    "[\\u0D47\\u0D46\\u0D4a\\u0D0E\\u0D12\\u0D31\\u0D34\\u0D35]", /*roundtrip exclusions*/

    "TELUGU-TAMIL", "[:TELUGU:]", "[:TAMIL:]",
    "[\\u0bd7\\u0ba9\\u0bb4\\u0BF0\\u0BF1\\u0BF2]", /*roundtrip exclusions*/

    "TAMIL-TELUGU", "[:TAMIL:]", "[:TELUGU:]",
    "[\\u0C01\\u0c43\\u0c44\\u0c46\\u0c47\\u0c55\\u0c56\\u0c66\\u0C0B\\u0C0C\\u0C16\\u0C17\\u0C18\\u0C1B\\u0C1D\\u0C20\\u0C21\\u0C22\\u0C25\\u0C26\\u0C27\\u0C2B\\u0C2C\\u0C2D\\u0C36\\u0C60\\u0C61]", /*roundtrip exclusions*/

    "KANNADA-TAMIL", "[:KANNADA:]", "[:TAMIL:]",
    "[\\u0bd7\\u0bc6\\u0ba9\\u0bb4\\u0BF0\\u0BF1\\u0BF2]", /*roundtrip exclusions*/

    "TAMIL-KANNADA", "[:TAMIL:]", "[:KANNADA:]",
    "[\\u0cc3\\u0cc4\\u0cc6\\u0cc7\\u0cd5\\u0cd6\\u0C8B\\u0C8C\\u0C96\\u0C97\\u0C98\\u0C9B\\u0C9D\\u0CA0\\u0CA1\\u0CA2\\u0CA5\\u0CA6\\u0CA7\\u0CAB\\u0CAC\\u0CAD\\u0CB6\\u0cbc\\u0cbd\\u0CDE\\u0CE0\\u0CE1]", /*roundtrip exclusions*/

    "MALAYALAM-TAMIL", "[:MALAYALAM:]", "[:TAMIL:]",
    "[\\u0ba9\\u0BF0\\u0BF1\\u0BF2]", /*roundtrip exclusions*/

    "TAMIL-MALAYALAM", "[:TAMIL:]", "[:MALAYALAM:]",
    "[\\u0d43\\u0d12\\u0D0B\\u0D0C\\u0D16\\u0D17\\u0D18\\u0D1B\\u0D1D\\u0D20\\u0D21\\u0D22\\u0D25\\u0D26\\u0D27\\u0D2B\\u0D2C\\u0D2D\\u0D36\\u0D60\\u0D61]", /*roundtrip exclusions*/

    "KANNADA-TELUGU", "[:KANNADA:]", "[:TELUGU:]",
    "[\\u0C01\\u0c3f\\u0c46\\u0c48\\u0c4a]", /*roundtrip exclusions*/

    "TELUGU-KANNADA", "[:TELUGU:]", "[:KANNADA:]",
    "[\\u0cc8\\u0cd5\\u0cd6\\u0cbc\\u0cbd\\u0CDE]", /*roundtrip exclusions*/

    "MALAYALAM-TELUGU", "[:MALAYALAM:]", "[:TELUGU:]",
    "[\\u0C01\\u0c44\\u0c4a\\u0c4c\\u0c4b\\u0c55\\u0c56]", /*roundtrip exclusions*/

    "TELUGU-MALAYALAM", "[:TELUGU:]", "[:MALAYALAM:]",
    "[\\u0d4c\\u0d57\\u0D34]", /*roundtrip exclusions*/

    "MALAYALAM-KANNADA", "[:MALAYALAM:]", "[:KANNADA:]",
    "[\\u0cbc\\u0cbd\\u0cc4\\u0cc6\\u0cca\\u0ccc\\u0ccb\\u0cd5\\u0cd6\\u0cDe]", /*roundtrip exclusions*/

    "KANNADA-MALAYALAM", "[:KANNADA:]", "[:MALAYALAM:]",
    "[\\u0d4c\\u0d57\\u0d46\\u0D34]", /*roundtrip exclusions*/

    "Latin-Bengali",latinForIndic, "[[:Bengali:][\\u0964\\u0965]]",
    "[\\u0965\\u09f0-\\u09fa\\u09ce]" /*roundtrip exclusions*/ ,

    "Latin-Gurmukhi", latinForIndic, "[[:Gurmukhi:][\\u0964\\u0965]]",
    "[\\u0a01\\u0965\\u0a02\\u0a72\\u0a73\\u0a74]" /*roundtrip exclusions*/,

    "Latin-Gujarati",latinForIndic, "[[:Gujarati:][\\u0964\\u0965]]",
    "[\\u0965]" /*roundtrip exclusions*/,

    "Latin-Oriya",latinForIndic, "[[:Oriya:][\\u0964\\u0965]]",
    "[\\u0965\\u0b70]" /*roundtrip exclusions*/,

    "Latin-Tamil",latinForIndic, "[:Tamil:]",
    "[\\u0BF0\\u0BF1\\u0BF2]" /*roundtrip exclusions*/,

    "Latin-Telugu",latinForIndic, "[:Telugu:]",
    NULL /*roundtrip exclusions*/,

    "Latin-Kannada",latinForIndic, "[:Kannada:]",
    NULL /*roundtrip exclusions*/,

    "Latin-Malayalam",latinForIndic, "[:Malayalam:]",
    NULL /*roundtrip exclusions*/
};
1598
TestDebug(const char * name,const char fromSet[],const char * toSet,const char * exclusions)1599 void TransliteratorRoundTripTest::TestDebug(const char* name,const char fromSet[],
1600 const char* toSet,const char* exclusions){
1601
1602 RTTest test(name);
1603 Legal *legal = new LegalIndic();
1604 test.test(UnicodeString(fromSet,""),UnicodeString(toSet,""),exclusions,this,quick,legal);
1605 }
1606
TestInterIndic()1607 void TransliteratorRoundTripTest::TestInterIndic() {
1608 //TestDebug("Latin-Gurmukhi", latinForIndic, "[:Gurmukhi:]","[\\u0965\\u0a02\\u0a72\\u0a73\\u0a74]",true);
1609 int32_t num = UPRV_LENGTHOF(interIndicArray)/INTER_INDIC_ARRAY_WIDTH;
1610 if(quick){
1611 logln("Testing only 5 of %i. Skipping rest (use -e for exhaustive)",num);
1612 num = 5;
1613 }
1614 for(int i = 0; i < num;i++){
1615 RTTest test(interIndicArray[i*INTER_INDIC_ARRAY_WIDTH + 0]);
1616 Legal *legal = new LegalIndic();
1617 logln(UnicodeString("Stress testing ") + interIndicArray[i*INTER_INDIC_ARRAY_WIDTH + 0]);
1618 if( !logKnownIssue( "cldrbug:1911" ) ) {
1619 /* "full test" */
1620 // CLDR bug #1911: This test should be moved into CLDR.
1621 test.test( interIndicArray[i*INTER_INDIC_ARRAY_WIDTH + 1],
1622 interIndicArray[i*INTER_INDIC_ARRAY_WIDTH + 2],
1623 interIndicArray[i*INTER_INDIC_ARRAY_WIDTH + 3], // roundtrip exclusions
1624 this, quick, legal, 50);
1625 } else {
1626 // It is left in its current state as a regression test.
1627 // CLDR should test, and remove the age filter.
1628 /* regression test - ""temporary"" until CLDR#1911 is fixed */
1629 // start
1630 UnicodeString source("[");
1631 source.append(interIndicArray[i*INTER_INDIC_ARRAY_WIDTH + 1]);
1632 source.append(" & [:Age=4.1:]]");
1633 UnicodeString target("[");
1634 target.append(interIndicArray[i*INTER_INDIC_ARRAY_WIDTH + 2]);
1635 target.append(" & [:Age=4.1:]]");
1636 test.test( source,
1637 target,
1638 interIndicArray[i*INTER_INDIC_ARRAY_WIDTH + 3], // roundtrip exclusions
1639 this, quick, legal, 50);
1640 // end
1641 delete legal;
1642 }
1643 }
1644 }
1645
1646 // end indic tests ----------------------------------------------------------
1647
1648 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
1649