• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   11/10/99    aliu        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "transtst.h"
18 #include "unicode/locid.h"
19 #include "unicode/dtfmtsym.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/translit.h"
22 #include "unicode/uchar.h"
23 #include "unicode/unifilt.h"
24 #include "unicode/uniset.h"
25 #include "unicode/ustring.h"
26 #include "unicode/usetiter.h"
27 #include "unicode/uscript.h"
28 #include "unicode/utf16.h"
29 #include "cpdtrans.h"
30 #include "nultrans.h"
31 #include "rbt.h"
32 #include "rbt_pars.h"
33 #include "anytrans.h"
34 #include "esctrn.h"
35 #include "name2uni.h"
36 #include "nortrans.h"
37 #include "remtrans.h"
38 #include "titletrn.h"
39 #include "tolowtrn.h"
40 #include "toupptrn.h"
41 #include "unesctrn.h"
42 #include "uni2name.h"
43 #include "cstring.h"
44 #include "cmemory.h"
45 #include <stdio.h>
46 
47 /***********************************************************************
48 
49                      HOW TO USE THIS TEST FILE
50                                -or-
51                   How I developed on two platforms
52                 without losing (too much of) my mind
53 
54 
55 1. Add new tests by copying/pasting/changing existing tests.  On Java,
56    any public void method named Test...() taking no parameters becomes
57    a test.  On C++, you need to modify the header and add a line to
58    the runIndexedTest() dispatch method.
59 
60 2. Make liberal use of the expect() method; it is your friend.
61 
62 3. The tests in this file exactly match those in a sister file on the
63    other side.  The two files are:
64 
65    icu4j:  src/com/ibm/test/translit/TransliteratorTest.java
66    icu4c:  source/test/intltest/transtst.cpp
67 
68                   ==> THIS IS THE IMPORTANT PART <==
69 
70    When you add a test in this file, add it in TransliteratorTest.java
71    too.  Give it the same name and put it in the same relative place.
72    This makes maintenance a lot simpler for any poor soul who ends up
73    trying to synchronize the tests between icu4j and icu4c.
74 
75 4. If you MUST enter a test that is NOT paralleled in the sister file,
76    then add it in the special non-mirrored section.  These are
77    labeled
78 
79      "icu4j ONLY"
80 
81    or
82 
83      "icu4c ONLY"
84 
85    Make sure you document the reason the test is here and not there.
86 
87 
88 Thank you.
89 The Management
90 ***********************************************************************/
91 
92 // Define character constants thusly to be EBCDIC-friendly
93 enum {
94     LEFT_BRACE=((UChar)0x007B), /*{*/
95     PIPE      =((UChar)0x007C), /*|*/
96     ZERO      =((UChar)0x0030), /*0*/
97     UPPER_A   =((UChar)0x0041)  /*A*/
98 };
99 
TransliteratorTest()100 TransliteratorTest::TransliteratorTest()
101 :   DESERET_DEE((UChar32)0x10414),
102     DESERET_dee((UChar32)0x1043C)
103 {
104 }
105 
~TransliteratorTest()106 TransliteratorTest::~TransliteratorTest() {}
107 
108 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)109 TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
110                                    const char* &name, char* /*par*/) {
111     switch (index) {
112         TESTCASE(0,TestInstantiation);
113         TESTCASE(1,TestSimpleRules);
114         TESTCASE(2,TestRuleBasedInverse);
115         TESTCASE(3,TestKeyboard);
116         TESTCASE(4,TestKeyboard2);
117         TESTCASE(5,TestKeyboard3);
118         TESTCASE(6,TestArabic);
119         TESTCASE(7,TestCompoundKana);
120         TESTCASE(8,TestCompoundHex);
121         TESTCASE(9,TestFiltering);
122         TESTCASE(10,TestInlineSet);
123         TESTCASE(11,TestPatternQuoting);
124         TESTCASE(12,TestJ277);
125         TESTCASE(13,TestJ243);
126         TESTCASE(14,TestJ329);
127         TESTCASE(15,TestSegments);
128         TESTCASE(16,TestCursorOffset);
129         TESTCASE(17,TestArbitraryVariableValues);
130         TESTCASE(18,TestPositionHandling);
131         TESTCASE(19,TestHiraganaKatakana);
132         TESTCASE(20,TestCopyJ476);
133         TESTCASE(21,TestAnchors);
134         TESTCASE(22,TestInterIndic);
135         TESTCASE(23,TestFilterIDs);
136         TESTCASE(24,TestCaseMap);
137         TESTCASE(25,TestNameMap);
138         TESTCASE(26,TestLiberalizedID);
139         TESTCASE(27,TestCreateInstance);
140         TESTCASE(28,TestNormalizationTransliterator);
141         TESTCASE(29,TestCompoundRBT);
142         TESTCASE(30,TestCompoundFilter);
143         TESTCASE(31,TestRemove);
144         TESTCASE(32,TestToRules);
145         TESTCASE(33,TestContext);
146         TESTCASE(34,TestSupplemental);
147         TESTCASE(35,TestQuantifier);
148         TESTCASE(36,TestSTV);
149         TESTCASE(37,TestCompoundInverse);
150         TESTCASE(38,TestNFDChainRBT);
151         TESTCASE(39,TestNullInverse);
152         TESTCASE(40,TestAliasInverseID);
153         TESTCASE(41,TestCompoundInverseID);
154         TESTCASE(42,TestUndefinedVariable);
155         TESTCASE(43,TestEmptyContext);
156         TESTCASE(44,TestCompoundFilterID);
157         TESTCASE(45,TestPropertySet);
158         TESTCASE(46,TestNewEngine);
159         TESTCASE(47,TestQuantifiedSegment);
160         TESTCASE(48,TestDevanagariLatinRT);
161         TESTCASE(49,TestTeluguLatinRT);
162         TESTCASE(50,TestCompoundLatinRT);
163         TESTCASE(51,TestSanskritLatinRT);
164         TESTCASE(52,TestLocaleInstantiation);
165         TESTCASE(53,TestTitleAccents);
166         TESTCASE(54,TestLocaleResource);
167         TESTCASE(55,TestParseError);
168         TESTCASE(56,TestOutputSet);
169         TESTCASE(57,TestVariableRange);
170         TESTCASE(58,TestInvalidPostContext);
171         TESTCASE(59,TestIDForms);
172         TESTCASE(60,TestToRulesMark);
173         TESTCASE(61,TestEscape);
174         TESTCASE(62,TestAnchorMasking);
175         TESTCASE(63,TestDisplayName);
176         TESTCASE(64,TestSpecialCases);
177 #if !UCONFIG_NO_FILE_IO
178         TESTCASE(65,TestIncrementalProgress);
179 #endif
180         TESTCASE(66,TestSurrogateCasing);
181         TESTCASE(67,TestFunction);
182         TESTCASE(68,TestInvalidBackRef);
183         TESTCASE(69,TestMulticharStringSet);
184         TESTCASE(70,TestUserFunction);
185         TESTCASE(71,TestAnyX);
186         TESTCASE(72,TestSourceTargetSet);
187         TESTCASE(73,TestGurmukhiDevanagari);
188         TESTCASE(74,TestPatternWhiteSpace);
189         TESTCASE(75,TestAllCodepoints);
190         TESTCASE(76,TestBoilerplate);
191         TESTCASE(77,TestAlternateSyntax);
192         TESTCASE(78,TestBeginEnd);
193         TESTCASE(79,TestBeginEndToRules);
194         TESTCASE(80,TestRegisterAlias);
195         TESTCASE(81,TestRuleStripping);
196         TESTCASE(82,TestHalfwidthFullwidth);
197         TESTCASE(83,TestThai);
198         TESTCASE(84,TestAny);
199         TESTCASE(85,TestBasicTransliteratorEvenWithoutData);
200         default: name = ""; break;
201     }
202 }
203 
204 /**
205  * Make sure every system transliterator can be instantiated.
206  *
207  * ALSO test that the result of toRules() for each rule is a valid
208  * rule.  Do this here so we don't have to have another test that
209  * instantiates everything as well.
210  */
TestInstantiation()211 void TransliteratorTest::TestInstantiation() {
212     UErrorCode ec = U_ZERO_ERROR;
213     StringEnumeration* avail = Transliterator::getAvailableIDs(ec);
214     assertSuccess("getAvailableIDs()", ec);
215     assertTrue("getAvailableIDs()!=NULL", avail!=NULL);
216     int32_t n = Transliterator::countAvailableIDs();
217     assertTrue("getAvailableIDs().count()==countAvailableIDs()",
218                avail->count(ec) == n);
219     assertSuccess("count()", ec);
220     UnicodeString name;
221     for (int32_t i=0; i<n; ++i) {
222         const UnicodeString& id = *avail->snext(ec);
223         if (!assertSuccess("snext()", ec) ||
224             !assertTrue("snext()!=NULL", (&id)!=NULL, true)) {
225             break;
226         }
227         UnicodeString id2 = Transliterator::getAvailableID(i);
228         if (id.length() < 1) {
229             errln(UnicodeString("FAIL: getAvailableID(") +
230                   i + ") returned empty string");
231             continue;
232         }
233         if (id != id2) {
234             errln(UnicodeString("FAIL: getAvailableID(") +
235                   i + ") != getAvailableIDs().snext()");
236             continue;
237         }
238         UParseError parseError;
239         UErrorCode status = U_ZERO_ERROR;
240         Transliterator* t = Transliterator::createInstance(id,
241                               UTRANS_FORWARD, parseError,status);
242         name.truncate(0);
243         Transliterator::getDisplayName(id, name);
244         if (t == 0) {
245 #if UCONFIG_NO_BREAK_ITERATION
246             // If UCONFIG_NO_BREAK_ITERATION is on, then only Thai should fail.
247             if (id.compare((UnicodeString)"Thai-Latn") != 0 &&
248                 id.compare((UnicodeString)"Thai-Latin") != 0)
249 #endif
250                 dataerrln(UnicodeString("FAIL: Couldn't create ") + id +
251                       /*", parse error " + parseError.code +*/
252                       ", line " + parseError.line +
253                       ", offset " + parseError.offset +
254                       ", pre-context " + prettify(parseError.preContext, true) +
255                       ", post-context " +prettify(parseError.postContext,true) +
256                       ", Error: " + u_errorName(status));
257                 // When createInstance fails, it deletes the failing
258                 // entry from the available ID list.  We detect this
259                 // here by looking for a change in countAvailableIDs.
260             int32_t nn = Transliterator::countAvailableIDs();
261             if (nn == (n - 1)) {
262                 n = nn;
263                 --i; // Compensate for deleted entry
264             }
265         } else {
266             logln(UnicodeString("OK: ") + name + " (" + id + ")");
267 
268             // Now test toRules
269             UnicodeString rules;
270             t->toRules(rules, true);
271             Transliterator *u = Transliterator::createFromRules("x",
272                                     rules, UTRANS_FORWARD, parseError,status);
273             if (u == 0) {
274                 errln(UnicodeString("FAIL: ") + id +
275                       ".createFromRules() => bad rules" +
276                       /*", parse error " + parseError.code +*/
277                       ", line " + parseError.line +
278                       ", offset " + parseError.offset +
279                       ", context " + prettify(parseError.preContext, true) +
280                       ", rules: " + prettify(rules, true));
281             } else {
282                 delete u;
283             }
284             delete t;
285         }
286     }
287     assertTrue("snext()==NULL", avail->snext(ec)==NULL);
288     assertSuccess("snext()", ec);
289     delete avail;
290 
291     // Now test the failure path
292     UParseError parseError;
293     UErrorCode status = U_ZERO_ERROR;
294     UnicodeString id("<Not a valid Transliterator ID>");
295     Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
296     if (t != 0) {
297         errln("FAIL: " + id + " returned a transliterator");
298         delete t;
299     } else {
300         logln("OK: Bogus ID handled properly");
301     }
302 }
303 
TestSimpleRules(void)304 void TransliteratorTest::TestSimpleRules(void) {
305     /* Example: rules 1. ab>x|y
306      *                2. yc>z
307      *
308      * []|eabcd  start - no match, copy e to translated buffer
309      * [e]|abcd  match rule 1 - copy output & adjust cursor
310      * [ex|y]cd  match rule 2 - copy output & adjust cursor
311      * [exz]|d   no match, copy d to transliterated buffer
312      * [exzd]|   done
313      */
314     expect(UnicodeString("ab>x|y;", "") +
315            "yc>z",
316            "eabcd", "exzd");
317 
318     /* Another set of rules:
319      *    1. ab>x|yzacw
320      *    2. za>q
321      *    3. qc>r
322      *    4. cw>n
323      *
324      * []|ab       Rule 1
325      * [x|yzacw]   No match
326      * [xy|zacw]   Rule 2
327      * [xyq|cw]    Rule 4
328      * [xyqn]|     Done
329      */
330     expect(UnicodeString("ab>x|yzacw;") +
331            "za>q;" +
332            "qc>r;" +
333            "cw>n",
334            "ab", "xyqn");
335 
336     /* Test categories
337      */
338     UErrorCode status = U_ZERO_ERROR;
339     UParseError parseError;
340     Transliterator *t = Transliterator::createFromRules(
341         "<ID>",
342         UnicodeString("$dummy=").append((UChar)0xE100) +
343         UnicodeString(";"
344                       "$vowel=[aeiouAEIOU];"
345                       "$lu=[:Lu:];"
346                       "$vowel } $lu > '!';"
347                       "$vowel > '&';"
348                       "'!' { $lu > '^';"
349                       "$lu > '*';"
350                       "a > ERROR", ""),
351         UTRANS_FORWARD, parseError,
352         status);
353     if (U_FAILURE(status)) {
354         dataerrln("FAIL: RBT constructor failed - %s", u_errorName(status));
355         return;
356     }
357     expect(*t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
358     delete t;
359 }
360 
361 /**
362  * Test inline set syntax and set variable syntax.
363  */
TestInlineSet(void)364 void TransliteratorTest::TestInlineSet(void) {
365     expect("{ [:Ll:] } x > y; [:Ll:] > z;", "aAbxq", "zAyzz");
366     expect("a[0-9]b > qrs", "1a7b9", "1qrs9");
367 
368     expect(UnicodeString(
369            "$digit = [0-9];"
370            "$alpha = [a-zA-Z];"
371            "$alphanumeric = [$digit $alpha];" // ***
372            "$special = [^$alphanumeric];"     // ***
373            "$alphanumeric > '-';"
374            "$special > '*';", ""),
375 
376            "thx-1138", "---*----");
377 }
378 
379 /**
380  * Create some inverses and confirm that they work.  We have to be
381  * careful how we do this, since the inverses will not be true
382  * inverses -- we can't throw any random string at the composition
383  * of the transliterators and expect the identity function.  F x
384  * F' != I.  However, if we are careful about the input, we will
385  * get the expected results.
386  */
TestRuleBasedInverse(void)387 void TransliteratorTest::TestRuleBasedInverse(void) {
388     UnicodeString RULES =
389         UnicodeString("abc>zyx;") +
390         "ab>yz;" +
391         "bc>zx;" +
392         "ca>xy;" +
393         "a>x;" +
394         "b>y;" +
395         "c>z;" +
396 
397         "abc<zyx;" +
398         "ab<yz;" +
399         "bc<zx;" +
400         "ca<xy;" +
401         "a<x;" +
402         "b<y;" +
403         "c<z;" +
404 
405         "";
406 
407     const char* DATA[] = {
408         // Careful here -- random strings will not work.  If we keep
409         // the left side to the domain and the right side to the range
410         // we will be okay though (left, abc; right xyz).
411         "a", "x",
412         "abcacab", "zyxxxyy",
413         "caccb", "xyzzy",
414     };
415 
416     int32_t DATA_length = UPRV_LENGTHOF(DATA);
417 
418     UErrorCode status = U_ZERO_ERROR;
419     UParseError parseError;
420     Transliterator *fwd = Transliterator::createFromRules("<ID>", RULES,
421                                 UTRANS_FORWARD, parseError, status);
422     Transliterator *rev = Transliterator::createFromRules("<ID>", RULES,
423                                 UTRANS_REVERSE, parseError, status);
424     if (U_FAILURE(status)) {
425         errln("FAIL: RBT constructor failed");
426         return;
427     }
428     for (int32_t i=0; i<DATA_length; i+=2) {
429         expect(*fwd, DATA[i], DATA[i+1]);
430         expect(*rev, DATA[i+1], DATA[i]);
431     }
432     delete fwd;
433     delete rev;
434 }
435 
436 /**
437  * Basic test of keyboard.
438  */
TestKeyboard(void)439 void TransliteratorTest::TestKeyboard(void) {
440     UParseError parseError;
441     UErrorCode status = U_ZERO_ERROR;
442     Transliterator *t = Transliterator::createFromRules("<ID>",
443                               UnicodeString("psch>Y;")
444                               +"ps>y;"
445                               +"ch>x;"
446                               +"a>A;",
447                               UTRANS_FORWARD, parseError,
448                               status);
449     if (U_FAILURE(status)) {
450         errln("FAIL: RBT constructor failed");
451         return;
452     }
453     const char* DATA[] = {
454         // insertion, buffer
455         "a", "A",
456         "p", "Ap",
457         "s", "Aps",
458         "c", "Apsc",
459         "a", "AycA",
460         "psch", "AycAY",
461         0, "AycAY", // null means finishKeyboardTransliteration
462     };
463 
464     keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
465     delete t;
466 }
467 
468 /**
469  * Basic test of keyboard with cursor.
470  */
TestKeyboard2(void)471 void TransliteratorTest::TestKeyboard2(void) {
472     UParseError parseError;
473     UErrorCode status = U_ZERO_ERROR;
474     Transliterator *t = Transliterator::createFromRules("<ID>",
475                               UnicodeString("ych>Y;")
476                               +"ps>|y;"
477                               +"ch>x;"
478                               +"a>A;",
479                               UTRANS_FORWARD, parseError,
480                               status);
481     if (U_FAILURE(status)) {
482         errln("FAIL: RBT constructor failed");
483         return;
484     }
485     const char* DATA[] = {
486         // insertion, buffer
487         "a", "A",
488         "p", "Ap",
489         "s", "Aps", // modified for rollback - "Ay",
490         "c", "Apsc", // modified for rollback - "Ayc",
491         "a", "AycA",
492         "p", "AycAp",
493         "s", "AycAps", // modified for rollback - "AycAy",
494         "c", "AycApsc", // modified for rollback - "AycAyc",
495         "h", "AycAY",
496         0, "AycAY", // null means finishKeyboardTransliteration
497     };
498 
499     keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
500     delete t;
501 }
502 
503 /**
504  * Test keyboard transliteration with back-replacement.
505  */
TestKeyboard3(void)506 void TransliteratorTest::TestKeyboard3(void) {
507     // We want th>z but t>y.  Furthermore, during keyboard
508     // transliteration we want t>y then yh>z if t, then h are
509     // typed.
510     UnicodeString RULES("t>|y;"
511                         "yh>z;");
512 
513     const char* DATA[] = {
514         // Column 1: characters to add to buffer (as if typed)
515         // Column 2: expected appearance of buffer after
516         //           keyboard xliteration.
517         "a", "a",
518         "b", "ab",
519         "t", "abt", // modified for rollback - "aby",
520         "c", "abyc",
521         "t", "abyct", // modified for rollback - "abycy",
522         "h", "abycz",
523         0, "abycz", // null means finishKeyboardTransliteration
524     };
525 
526     UParseError parseError;
527     UErrorCode status = U_ZERO_ERROR;
528     Transliterator *t = Transliterator::createFromRules("<ID>", RULES, UTRANS_FORWARD, parseError, status);
529     if (U_FAILURE(status)) {
530         errln("FAIL: RBT constructor failed");
531         return;
532     }
533     keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
534     delete t;
535 }
536 
keyboardAux(const Transliterator & t,const char * DATA[],int32_t DATA_length)537 void TransliteratorTest::keyboardAux(const Transliterator& t,
538                                      const char* DATA[], int32_t DATA_length) {
539     UErrorCode status = U_ZERO_ERROR;
540     UTransPosition index={0, 0, 0, 0};
541     UnicodeString s;
542     for (int32_t i=0; i<DATA_length; i+=2) {
543         UnicodeString log;
544         if (DATA[i] != 0) {
545             log = s + " + "
546                 + DATA[i]
547                 + " -> ";
548             t.transliterate(s, index, DATA[i], status);
549         } else {
550             log = s + " => ";
551             t.finishTransliteration(s, index);
552         }
553         // Show the start index '{' and the cursor '|'
554         UnicodeString a, b, c;
555         s.extractBetween(0, index.contextStart, a);
556         s.extractBetween(index.contextStart, index.start, b);
557         s.extractBetween(index.start, s.length(), c);
558         log.append(a).
559             append((UChar)LEFT_BRACE).
560             append(b).
561             append((UChar)PIPE).
562             append(c);
563         if (s == DATA[i+1] && U_SUCCESS(status)) {
564             logln(log);
565         } else {
566             errln(UnicodeString("FAIL: ") + log + ", expected " + DATA[i+1]);
567         }
568     }
569 }
570 
TestArabic(void)571 void TransliteratorTest::TestArabic(void) {
572 // Test disabled for 2.0 until new Arabic transliterator can be written.
573 //    /*
574 //    const char* DATA[] = {
575 //        "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
576 //                  "\u0627\u0644\u0644\u063a\u0629\u0020"+
577 //                  "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
578 //                  "\u0628\u0628\u0646\u0638\u0645\u0020"+
579 //                  "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
580 //                  "\u062c\u0645\u064a\u0644\u0629",
581 //    };
582 //    */
583 //
584 //    UChar ar_raw[] = {
585 //        0x062a, 0x062a, 0x0645, 0x062a, 0x0639, 0x0020, 0x0627,
586 //        0x0644, 0x0644, 0x063a, 0x0629, 0x0020, 0x0627, 0x0644,
587 //        0x0639, 0x0631, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
588 //        0x0628, 0x0628, 0x0646, 0x0638, 0x0645, 0x0020, 0x0643,
589 //        0x062a, 0x0627, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
590 //        0x062c, 0x0645, 0x064a, 0x0644, 0x0629, 0
591 //    };
592 //    UnicodeString ar(ar_raw);
593 //    UErrorCode status=U_ZERO_ERROR;
594 //    UParseError parseError;
595 //    Transliterator *t = Transliterator::createInstance("Latin-Arabic", UTRANS_FORWARD, parseError, status);
596 //    if (t == 0) {
597 //        errln("FAIL: createInstance failed");
598 //        return;
599 //    }
600 //    expect(*t, "Arabic", ar);
601 //    delete t;
602 }
603 
604 /**
605  * Compose the Kana transliterator forward and reverse and try
606  * some strings that should come out unchanged.
607  */
TestCompoundKana(void)608 void TransliteratorTest::TestCompoundKana(void) {
609     UParseError parseError;
610     UErrorCode status = U_ZERO_ERROR;
611     Transliterator* t = Transliterator::createInstance("Latin-Hiragana;Hiragana-Latin", UTRANS_FORWARD, parseError, status);
612     if (t == 0) {
613         dataerrln("FAIL: construction of Latin-Hiragana;Hiragana-Latin failed - %s", u_errorName(status));
614     } else {
615         expect(*t, "aaaaa", "aaaaa");
616         delete t;
617     }
618 }
619 
620 /**
621  * Compose the hex transliterators forward and reverse.
622  */
TestCompoundHex(void)623 void TransliteratorTest::TestCompoundHex(void) {
624     UParseError parseError;
625     UErrorCode status = U_ZERO_ERROR;
626     Transliterator* a = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
627     Transliterator* b = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, parseError, status);
628     Transliterator* transab[] = { a, b };
629     Transliterator* transba[] = { b, a };
630     if (a == 0 || b == 0) {
631         errln("FAIL: construction failed");
632         delete a;
633         delete b;
634         return;
635     }
636     // Do some basic tests of a
637     expect(*a, "01", UnicodeString("\\u0030\\u0031", ""));
638     // Do some basic tests of b
639     expect(*b, UnicodeString("\\u0030\\u0031", ""), "01");
640 
641     Transliterator* ab = new CompoundTransliterator(transab, 2);
642     UnicodeString s("abcde", "");
643     expect(*ab, s, s);
644 
645     UnicodeString str(s);
646     a->transliterate(str);
647     Transliterator* ba = new CompoundTransliterator(transba, 2);
648     expect(*ba, str, str);
649 
650     delete ab;
651     delete ba;
652     delete a;
653     delete b;
654 }
655 
656 int gTestFilterClassID = 0;
657 /**
658  * Used by TestFiltering().
659  */
660 class TestFilter : public UnicodeFilter {
clone() const661     virtual TestFilter* clone() const override {
662         return new TestFilter(*this);
663     }
contains(UChar32 c) const664     virtual UBool contains(UChar32 c) const override {
665         return c != (UChar)0x0063 /*c*/;
666     }
667     // Stubs
toPattern(UnicodeString & result,UBool) const668     virtual UnicodeString& toPattern(UnicodeString& result,
669                                      UBool /*escapeUnprintable*/) const override {
670         return result;
671     }
matchesIndexValue(uint8_t) const672     virtual UBool matchesIndexValue(uint8_t /*v*/) const override {
673         return false;
674     }
addMatchSetTo(UnicodeSet &) const675     virtual void addMatchSetTo(UnicodeSet& /*toUnionTo*/) const override {}
676 public:
getDynamicClassID() const677     UClassID getDynamicClassID() const override { return (UClassID)&gTestFilterClassID; }
678 };
679 
680 /**
681  * Do some basic tests of filtering.
682  */
TestFiltering(void)683 void TransliteratorTest::TestFiltering(void) {
684     UParseError parseError;
685     UErrorCode status = U_ZERO_ERROR;
686     Transliterator* hex = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
687     if (hex == 0) {
688         errln("FAIL: createInstance(Any-Hex) failed");
689         return;
690     }
691     hex->adoptFilter(new TestFilter());
692     UnicodeString s("abcde");
693     hex->transliterate(s);
694     UnicodeString exp("\\u0061\\u0062c\\u0064\\u0065", "");
695     if (s == exp) {
696         logln(UnicodeString("Ok:   \"") + exp + "\"");
697     } else {
698         logln(UnicodeString("FAIL: \"") + s + "\", wanted \"" + exp + "\"");
699     }
700 
701     // ICU4C ONLY. Do not find Transliterator.orphanFilter() in ICU4J.
702     UnicodeFilter *f = hex->orphanFilter();
703     if (f == NULL){
704         errln("FAIL: orphanFilter() should get a UnicodeFilter");
705     } else {
706         delete f;
707     }
708     delete hex;
709 }
710 
711 /**
712  * Test anchors
713  */
TestAnchors(void)714 void TransliteratorTest::TestAnchors(void) {
715     expect(UnicodeString("^a  > 0; a$ > 2 ; a > 1;", ""),
716            "aaa",
717            "012");
718     expect(UnicodeString("$s=[z$]; $s{a>0; a}$s>2; a>1;", ""),
719            "aaa",
720            "012");
721     expect(UnicodeString("^ab  > 01 ;"
722            " ab  > |8 ;"
723            "  b  > k ;"
724            " 8x$ > 45 ;"
725            " 8x  > 77 ;", ""),
726 
727            "ababbabxabx",
728            "018k7745");
729     expect(UnicodeString("$s = [z$] ;"
730            "$s{ab    > 01 ;"
731            "   ab    > |8 ;"
732            "    b    > k ;"
733            "   8x}$s > 45 ;"
734            "   8x    > 77 ;", ""),
735 
736            "abzababbabxzabxabx",
737            "01z018k45z01x45");
738 }
739 
740 /**
741  * Test pattern quoting and escape mechanisms.
742  */
TestPatternQuoting(void)743 void TransliteratorTest::TestPatternQuoting(void) {
744     // Array of 3n items
745     // Each item is <rules>, <input>, <expected output>
746     const UnicodeString DATA[] = {
747         UnicodeString(UChar(0x4E01)) + ">'[male adult]'",
748         UnicodeString(UChar(0x4E01)),
749         "[male adult]"
750     };
751 
752     for (int32_t i=0; i<3; i+=3) {
753         logln(UnicodeString("Pattern: ") + prettify(DATA[i]));
754         UParseError parseError;
755         UErrorCode status = U_ZERO_ERROR;
756         Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
757         if (U_FAILURE(status)) {
758             errln("RBT constructor failed");
759         } else {
760             expect(*t, DATA[i+1], DATA[i+2]);
761         }
762         delete t;
763     }
764 }
765 
766 /**
767  * Regression test for bugs found in Greek transliteration.
768  */
TestJ277(void)769 void TransliteratorTest::TestJ277(void) {
770     UErrorCode status = U_ZERO_ERROR;
771     UParseError parseError;
772     Transliterator *gl = Transliterator::createInstance("Greek-Latin; NFD; [:M:]Remove; NFC", UTRANS_FORWARD, parseError, status);
773     if (gl == NULL) {
774         dataerrln("FAIL: createInstance(Greek-Latin) returned NULL - %s", u_errorName(status));
775         return;
776     }
777 
778     UChar sigma = 0x3C3;
779     UChar upsilon = 0x3C5;
780     UChar nu = 0x3BD;
781 //    UChar PHI = 0x3A6;
782     UChar alpha = 0x3B1;
783 //    UChar omega = 0x3C9;
784 //    UChar omicron = 0x3BF;
785 //    UChar epsilon = 0x3B5;
786 
787     // sigma upsilon nu -> syn
788     UnicodeString syn;
789     syn.append(sigma).append(upsilon).append(nu);
790     expect(*gl, syn, "syn");
791 
792     // sigma alpha upsilon nu -> saun
793     UnicodeString sayn;
794     sayn.append(sigma).append(alpha).append(upsilon).append(nu);
795     expect(*gl, sayn, "saun");
796 
797     // Again, using a smaller rule set
798     UnicodeString rules(
799                 "$alpha   = \\u03B1;"
800                 "$nu      = \\u03BD;"
801                 "$sigma   = \\u03C3;"
802                 "$ypsilon = \\u03C5;"
803                 "$vowel   = [aeiouAEIOU$alpha$ypsilon];"
804                 "s <>           $sigma;"
805                 "a <>           $alpha;"
806                 "u <>  $vowel { $ypsilon;"
807                 "y <>           $ypsilon;"
808                 "n <>           $nu;",
809                 "");
810     Transliterator *mini = Transliterator::createFromRules("mini", rules, UTRANS_REVERSE, parseError, status);
811     if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
812     expect(*mini, syn, "syn");
813     expect(*mini, sayn, "saun");
814     delete mini;
815     mini = NULL;
816 
817 #if !UCONFIG_NO_FORMATTING
818     // Transliterate the Greek locale data
819     Locale el("el");
820     DateFormatSymbols syms(el, status);
821     if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
822     int32_t i, count;
823     const UnicodeString* data = syms.getMonths(count);
824     for (i=0; i<count; ++i) {
825         if (data[i].length() == 0) {
826             continue;
827         }
828         UnicodeString out(data[i]);
829         gl->transliterate(out);
830         UBool ok = true;
831         if (data[i].length() >= 2 && out.length() >= 2 &&
832             u_isupper(data[i].charAt(0)) && u_islower(data[i].charAt(1))) {
833             if (!(u_isupper(out.charAt(0)) && u_islower(out.charAt(1)))) {
834                 ok = false;
835             }
836         }
837         if (ok) {
838             logln(prettify(data[i] + " -> " + out));
839         } else {
840             errln(UnicodeString("FAIL: ") + prettify(data[i] + " -> " + out));
841         }
842     }
843 #endif
844 
845     delete gl;
846 }
847 
848 /**
849  * Prefix, suffix support in hex transliterators
850  */
TestJ243(void)851 void TransliteratorTest::TestJ243(void) {
852     UErrorCode ec = U_ZERO_ERROR;
853 
854     // Test default Hex-Any, which should handle
855     // \u, \U, u+, and U+
856     Transliterator *hex =
857         Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, ec);
858     if (assertSuccess("getInstance", ec)) {
859         expect(*hex, UnicodeString("\\u0041+\\U00000042,U+0043uU+0044z", ""), "A+B,CuDz");
860     }
861     delete hex;
862 
863 //    // Try a custom Hex-Unicode
864 //    // \uXXXX and &#xXXXX;
865 //    ec = U_ZERO_ERROR;
866 //    HexToUnicodeTransliterator hex2(UnicodeString("\\\\u###0;&\\#x###0\\;", ""), ec);
867 //    expect(hex2, UnicodeString("\\u61\\u062\\u0063\\u00645\\u66x&#x30;&#x031;&#x0032;&#x00033;", ""),
868 //           "abcd5fx012&#x00033;");
869 //    // Try custom Any-Hex (default is tested elsewhere)
870 //    ec = U_ZERO_ERROR;
871 //    UnicodeToHexTransliterator hex3(UnicodeString("&\\#x###0;", ""), ec);
872 //    expect(hex3, "012", "&#x30;&#x31;&#x32;");
873 }
874 
875 /**
876  * Parsers need better syntax error messages.
877  */
TestJ329(void)878 void TransliteratorTest::TestJ329(void) {
879 
880     struct { UBool containsErrors; const char* rule; } DATA[] = {
881         { false, "a > b; c > d" },
882         { true,  "a > b; no operator; c > d" },
883     };
884     int32_t DATA_length = UPRV_LENGTHOF(DATA);
885 
886     for (int32_t i=0; i<DATA_length; ++i) {
887         UErrorCode status = U_ZERO_ERROR;
888         UParseError parseError;
889         Transliterator *rbt = Transliterator::createFromRules("<ID>",
890                                     DATA[i].rule,
891                                     UTRANS_FORWARD,
892                                     parseError,
893                                     status);
894         UBool gotError = U_FAILURE(status);
895         UnicodeString desc(DATA[i].rule);
896         desc.append(gotError ? " -> error" : " -> no error");
897         if (gotError) {
898             desc = desc + ", ParseError code=" + u_errorName(status) +
899                 " line=" + parseError.line +
900                 " offset=" + parseError.offset +
901                 " context=" + parseError.preContext;
902         }
903         if (gotError == DATA[i].containsErrors) {
904             logln(UnicodeString("Ok:   ") + desc);
905         } else {
906             errln(UnicodeString("FAIL: ") + desc);
907         }
908         delete rbt;
909     }
910 }
911 
912 /**
913  * Test segments and segment references.
914  */
TestSegments(void)915 void TransliteratorTest::TestSegments(void) {
916     // Array of 3n items
917     // Each item is <rules>, <input>, <expected output>
918     UnicodeString DATA[] = {
919         "([a-z]) '.' ([0-9]) > $2 '-' $1",
920         "abc.123.xyz.456",
921         "ab1-c23.xy4-z56",
922 
923         // nested
924         "(([a-z])([0-9])) > $1 '.' $2 '.' $3;",
925         "a1 b2",
926         "a1.a.1 b2.b.2",
927     };
928     int32_t DATA_length = UPRV_LENGTHOF(DATA);
929 
930     for (int32_t i=0; i<DATA_length; i+=3) {
931         logln("Pattern: " + prettify(DATA[i]));
932         UParseError parseError;
933         UErrorCode status = U_ZERO_ERROR;
934         Transliterator *t = Transliterator::createFromRules("ID", DATA[i], UTRANS_FORWARD, parseError, status);
935         if (U_FAILURE(status)) {
936             errln("FAIL: RBT constructor");
937         } else {
938             expect(*t, DATA[i+1], DATA[i+2]);
939         }
940         delete t;
941     }
942 }
943 
944 /**
945  * Test cursor positioning outside of the key
946  */
TestCursorOffset(void)947 void TransliteratorTest::TestCursorOffset(void) {
948     // Array of 3n items
949     // Each item is <rules>, <input>, <expected output>
950     UnicodeString DATA[] = {
951         "pre {alpha} post > | @ ALPHA ;"
952         "eALPHA > beta ;"
953         "pre {beta} post > BETA @@ | ;"
954         "post > xyz",
955 
956         "prealphapost prebetapost",
957 
958         "prbetaxyz preBETApost",
959     };
960     int32_t DATA_length = UPRV_LENGTHOF(DATA);
961 
962     for (int32_t i=0; i<DATA_length; i+=3) {
963         logln("Pattern: " + prettify(DATA[i]));
964         UParseError parseError;
965         UErrorCode status = U_ZERO_ERROR;
966         Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
967         if (U_FAILURE(status)) {
968             errln("FAIL: RBT constructor");
969         } else {
970             expect(*t, DATA[i+1], DATA[i+2]);
971         }
972         delete t;
973     }
974 }
975 
976 /**
977  * Test zero length and > 1 char length variable values.  Test
978  * use of variable refs in UnicodeSets.
979  */
TestArbitraryVariableValues(void)980 void TransliteratorTest::TestArbitraryVariableValues(void) {
981     // Array of 3n items
982     // Each item is <rules>, <input>, <expected output>
983     UnicodeString DATA[] = {
984         "$abe = ab;"
985         "$pat = x[yY]z;"
986         "$ll  = 'a-z';"
987         "$llZ = [$ll];"
988         "$llY = [$ll$pat];"
989         "$emp = ;"
990 
991         "$abe > ABE;"
992         "$pat > END;"
993         "$llZ > 1;"
994         "$llY > 2;"
995         "7$emp 8 > 9;"
996         "",
997 
998         "ab xYzxyz stY78",
999         "ABE ENDEND 1129",
1000     };
1001     int32_t DATA_length = UPRV_LENGTHOF(DATA);
1002 
1003     for (int32_t i=0; i<DATA_length; i+=3) {
1004         logln("Pattern: " + prettify(DATA[i]));
1005         UParseError parseError;
1006         UErrorCode status = U_ZERO_ERROR;
1007         Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
1008         if (U_FAILURE(status)) {
1009             errln("FAIL: RBT constructor");
1010         } else {
1011             expect(*t, DATA[i+1], DATA[i+2]);
1012         }
1013         delete t;
1014     }
1015 }
1016 
1017 /**
1018  * Confirm that the contextStart, contextLimit, start, and limit
1019  * behave correctly. J474.
1020  */
TestPositionHandling(void)1021 void TransliteratorTest::TestPositionHandling(void) {
1022     // Array of 3n items
1023     // Each item is <rules>, <input>, <expected output>
1024     const char* DATA[] = {
1025         "a{t} > SS ; {t}b > UU ; {t} > TT ;",
1026         "xtat txtb", // pos 0,9,0,9
1027         "xTTaSS TTxUUb",
1028 
1029         "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1030         "xtat txtb", // pos 2,9,3,8
1031         "xtaSS TTxUUb",
1032 
1033         "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1034         "xtat txtb", // pos 3,8,3,8
1035         "xtaTT TTxTTb",
1036     };
1037 
1038     // Array of 4n positions -- these go with the DATA array
1039     // They are: contextStart, contextLimit, start, limit
1040     int32_t POS[] = {
1041         0, 9, 0, 9,
1042         2, 9, 3, 8,
1043         3, 8, 3, 8,
1044     };
1045 
1046     int32_t n = UPRV_LENGTHOF(DATA) / 3;
1047     for (int32_t i=0; i<n; i++) {
1048         UErrorCode status = U_ZERO_ERROR;
1049         UParseError parseError;
1050         Transliterator *t = Transliterator::createFromRules("<ID>",
1051                                 DATA[3*i], UTRANS_FORWARD, parseError, status);
1052         if (U_FAILURE(status)) {
1053             delete t;
1054             errln("FAIL: RBT constructor");
1055             return;
1056         }
1057         UTransPosition pos;
1058         pos.contextStart= POS[4*i];
1059         pos.contextLimit = POS[4*i+1];
1060         pos.start = POS[4*i+2];
1061         pos.limit = POS[4*i+3];
1062         UnicodeString rsource(DATA[3*i+1]);
1063         t->transliterate(rsource, pos, status);
1064         if (U_FAILURE(status)) {
1065             delete t;
1066             errln("FAIL: transliterate");
1067             return;
1068         }
1069         t->finishTransliteration(rsource, pos);
1070         expectAux(DATA[3*i],
1071                   DATA[3*i+1],
1072                   rsource,
1073                   DATA[3*i+2]);
1074         delete t;
1075     }
1076 }
1077 
1078 /**
1079  * Test the Hiragana-Katakana transliterator.
1080  */
TestHiraganaKatakana(void)1081 void TransliteratorTest::TestHiraganaKatakana(void) {
1082     UParseError parseError;
1083     UErrorCode status = U_ZERO_ERROR;
1084     Transliterator* hk = Transliterator::createInstance("Hiragana-Katakana", UTRANS_FORWARD, parseError, status);
1085     Transliterator* kh = Transliterator::createInstance("Katakana-Hiragana", UTRANS_FORWARD, parseError, status);
1086     if (hk == 0 || kh == 0) {
1087         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1088         delete hk;
1089         delete kh;
1090         return;
1091     }
1092 
1093     // Array of 3n items
1094     // Each item is "hk"|"kh"|"both", <Hiragana>, <Katakana>
1095     const char* DATA[] = {
1096         "both",
1097         "\\u3042\\u3090\\u3099\\u3092\\u3050",
1098         "\\u30A2\\u30F8\\u30F2\\u30B0",
1099 
1100         "kh",
1101         "\\u307C\\u3051\\u3060\\u3042\\u3093\\u30FC",
1102         "\\u30DC\\u30F6\\u30C0\\u30FC\\u30F3\\u30FC",
1103     };
1104     int32_t DATA_length = UPRV_LENGTHOF(DATA);
1105 
1106     for (int32_t i=0; i<DATA_length; i+=3) {
1107         UnicodeString h = CharsToUnicodeString(DATA[i+1]);
1108         UnicodeString k = CharsToUnicodeString(DATA[i+2]);
1109         switch (*DATA[i]) {
1110         case 0x68: //'h': // Hiragana-Katakana
1111             expect(*hk, h, k);
1112             break;
1113         case 0x6B: //'k': // Katakana-Hiragana
1114             expect(*kh, k, h);
1115             break;
1116         case 0x62: //'b': // both
1117             expect(*hk, h, k);
1118             expect(*kh, k, h);
1119             break;
1120         }
1121     }
1122     delete hk;
1123     delete kh;
1124 }
1125 
1126 /**
1127  * Test cloning / copy constructor of RBT.
1128  */
TestCopyJ476(void)1129 void TransliteratorTest::TestCopyJ476(void) {
1130     // The real test here is what happens when the destructors are
1131     // called.  So we let one object get destructed, and check to
1132     // see that its copy still works.
1133     Transliterator *t2 = 0;
1134     {
1135         UParseError parseError;
1136         UErrorCode status = U_ZERO_ERROR;
1137         Transliterator *t1 = Transliterator::createFromRules("t1",
1138             "a>A;b>B;'foo'+>'bar'", UTRANS_FORWARD, parseError, status);
1139         if (U_FAILURE(status)) {
1140             errln("FAIL: RBT constructor");
1141             return;
1142         }
1143         t2 = t1->clone(); // Call copy constructor under the covers.
1144         expect(*t1, "abcfoofoo", "ABcbar");
1145         delete t1;
1146     }
1147     expect(*t2, "abcfoofoo", "ABcbar");
1148     delete t2;
1149 }
1150 
1151 /**
1152  * Test inter-Indic transliterators.  These are composed.
1153  * ICU4C Jitterbug 483.
1154  */
TestInterIndic(void)1155 void TransliteratorTest::TestInterIndic(void) {
1156     UnicodeString ID("Devanagari-Gujarati", "");
1157     UErrorCode status = U_ZERO_ERROR;
1158     UParseError parseError;
1159     Transliterator* dg = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1160     if (dg == 0) {
1161         dataerrln("FAIL: createInstance(" + ID + ") returned NULL - " + u_errorName(status));
1162         return;
1163     }
1164     UnicodeString id = dg->getID();
1165     if (id != ID) {
1166         errln("FAIL: createInstance(" + ID + ")->getID() => " + id);
1167     }
1168     UnicodeString dev = CharsToUnicodeString("\\u0901\\u090B\\u0925");
1169     UnicodeString guj = CharsToUnicodeString("\\u0A81\\u0A8B\\u0AA5");
1170     expect(*dg, dev, guj);
1171     delete dg;
1172 }
1173 
1174 /**
1175  * Test filter syntax in IDs. (J918)
1176  */
TestFilterIDs(void)1177 void TransliteratorTest::TestFilterIDs(void) {
1178     // Array of 3n strings:
1179     // <id>, <inverse id>, <input>, <expected output>
1180     const char* DATA[] = {
1181         "[aeiou]Any-Hex", // ID
1182         "[aeiou]Hex-Any", // expected inverse ID
1183         "quizzical",      // src
1184         "q\\u0075\\u0069zz\\u0069c\\u0061l", // expected ID.translit(src)
1185 
1186         "[aeiou]Any-Hex;[^5]Hex-Any",
1187         "[^5]Any-Hex;[aeiou]Hex-Any",
1188         "quizzical",
1189         "q\\u0075izzical",
1190 
1191         "[abc]Null",
1192         "[abc]Null",
1193         "xyz",
1194         "xyz",
1195     };
1196     enum { DATA_length = UPRV_LENGTHOF(DATA) };
1197 
1198     for (int i=0; i<DATA_length; i+=4) {
1199         UnicodeString ID(DATA[i], "");
1200         UnicodeString uID(DATA[i+1], "");
1201         UnicodeString data2(DATA[i+2], "");
1202         UnicodeString data3(DATA[i+3], "");
1203         UParseError parseError;
1204         UErrorCode status = U_ZERO_ERROR;
1205         Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1206         if (t == 0) {
1207             errln("FAIL: createInstance(" + ID + ") returned NULL");
1208             return;
1209         }
1210         expect(*t, data2, data3);
1211 
1212         // Check the ID
1213         if (ID != t->getID()) {
1214             errln("FAIL: createInstance(" + ID + ").getID() => " +
1215                   t->getID());
1216         }
1217 
1218         // Check the inverse
1219         Transliterator *u = t->createInverse(status);
1220         if (u == 0) {
1221             errln("FAIL: " + ID + ".createInverse() returned NULL");
1222         } else if (u->getID() != uID) {
1223             errln("FAIL: " + ID + ".createInverse().getID() => " +
1224                   u->getID() + ", expected " + uID);
1225         }
1226 
1227         delete t;
1228         delete u;
1229     }
1230 }
1231 
1232 /**
1233  * Test the case mapping transliterators.
1234  */
TestCaseMap(void)1235 void TransliteratorTest::TestCaseMap(void) {
1236     UParseError parseError;
1237     UErrorCode status = U_ZERO_ERROR;
1238     Transliterator* toUpper =
1239         Transliterator::createInstance("Any-Upper[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1240     Transliterator* toLower =
1241         Transliterator::createInstance("Any-Lower[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1242     Transliterator* toTitle =
1243         Transliterator::createInstance("Any-Title[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1244     if (toUpper==0 || toLower==0 || toTitle==0) {
1245         errln("FAIL: createInstance returned NULL");
1246         delete toUpper;
1247         delete toLower;
1248         delete toTitle;
1249         return;
1250     }
1251 
1252     expect(*toUpper, "The quick brown fox jumped over the lazy dogs.",
1253            "THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
1254     expect(*toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
1255            "the quick brown foX jumped over the lazY dogs.");
1256     expect(*toTitle, "the quick brown foX can't jump over the laZy dogs.",
1257            "The Quick Brown FoX Can't Jump Over The LaZy Dogs.");
1258 
1259     delete toUpper;
1260     delete toLower;
1261     delete toTitle;
1262 }
1263 
1264 /**
1265  * Test the name mapping transliterators.
1266  */
TestNameMap(void)1267 void TransliteratorTest::TestNameMap(void) {
1268     UParseError parseError;
1269     UErrorCode status = U_ZERO_ERROR;
1270     Transliterator* uni2name =
1271         Transliterator::createInstance("Any-Name[^abc]", UTRANS_FORWARD, parseError, status);
1272     Transliterator* name2uni =
1273         Transliterator::createInstance("Name-Any", UTRANS_FORWARD, parseError, status);
1274     if (uni2name==0 || name2uni==0) {
1275         errln("FAIL: createInstance returned NULL");
1276         delete uni2name;
1277         delete name2uni;
1278         return;
1279     }
1280 
1281     // Careful:  CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1282     expect(*uni2name, CharsToUnicodeString("\\u00A0abc\\u4E01\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF"),
1283            CharsToUnicodeString("\\\\N{NO-BREAK SPACE}abc\\\\N{CJK UNIFIED IDEOGRAPH-4E01}\\\\N{MICRO SIGN}\\\\N{GUJARATI SIGN CANDRABINDU}\\\\N{REPLACEMENT CHARACTER}\\\\N{<control-0004>}\\\\N{<control-0009>}\\\\N{<control-0081>}\\\\N{<noncharacter-FFFF>}"));
1284     expect(*name2uni, UNICODE_STRING_SIMPLE("{\\N { NO-BREAK SPACE}abc\\N{  CJK UNIFIED  IDEOGRAPH-4E01  }\\N{x\\N{MICRO SIGN}\\N{GUJARATI SIGN CANDRABINDU}\\N{REPLACEMENT CHARACTER}\\N{<control-0004>}\\N{<control-0009>}\\N{<control-0081>}\\N{<noncharacter-FFFF>}\\N{<control-0004>}\\N{"),
1285            CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{"));
1286 
1287     delete uni2name;
1288     delete name2uni;
1289 
1290     // round trip
1291     Transliterator* t =
1292         Transliterator::createInstance("Any-Name;Name-Any", UTRANS_FORWARD, parseError, status);
1293     if (t==0) {
1294         errln("FAIL: createInstance returned NULL");
1295         delete t;
1296         return;
1297     }
1298 
1299     // Careful:  CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1300     UnicodeString s = CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{");
1301     expect(*t, s, s);
1302     delete t;
1303 }
1304 
1305 /**
1306  * Test liberalized ID syntax.  1006c
1307  */
TestLiberalizedID(void)1308 void TransliteratorTest::TestLiberalizedID(void) {
1309     // Some test cases have an expected getID() value of NULL.  This
1310     // means I have disabled the test case for now.  This stuff is
1311     // still under development, and I haven't decided whether to make
1312     // getID() return canonical case yet.  It will all get rewritten
1313     // with the move to Source-Target/Variant IDs anyway. [aliu]
1314     const char* DATA[] = {
1315         "latin-greek", NULL /*"Latin-Greek"*/, "case insensitivity",
1316         "  Null  ", "Null", "whitespace",
1317         " Latin[a-z]-Greek  ", "[a-z]Latin-Greek", "inline filter",
1318         "  null  ; latin-greek  ", NULL /*"Null;Latin-Greek"*/, "compound whitespace",
1319     };
1320     const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1321     UParseError parseError;
1322     UErrorCode status= U_ZERO_ERROR;
1323     for (int32_t i=0; i<DATA_length; i+=3) {
1324         Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, parseError, status);
1325         if (t == 0) {
1326             dataerrln(UnicodeString("FAIL: ") + DATA[i+2] +
1327                   " cannot create ID \"" + DATA[i] + "\" - " + u_errorName(status));
1328         } else {
1329             UnicodeString exp;
1330             if (DATA[i+1]) {
1331                 exp = UnicodeString(DATA[i+1], "");
1332             }
1333             // Don't worry about getID() if the expected char*
1334             // is NULL -- see above.
1335             if (exp.length() == 0 || exp == t->getID()) {
1336                 logln(UnicodeString("Ok: ") + DATA[i+2] +
1337                       " create ID \"" + DATA[i] + "\" => \"" +
1338                       exp + "\"");
1339             } else {
1340                 errln(UnicodeString("FAIL: ") + DATA[i+2] +
1341                       " create ID \"" + DATA[i] + "\" => \"" +
1342                       t->getID() + "\", exp \"" + exp + "\"");
1343             }
1344             delete t;
1345         }
1346     }
1347 }
1348 
1349 /* test for Jitterbug 912 */
TestCreateInstance()1350 void TransliteratorTest::TestCreateInstance(){
1351     const char* FORWARD = "F";
1352     const char* REVERSE = "R";
1353     const char* DATA[] = {
1354         // Column 1: id
1355         // Column 2: direction
1356         // Column 3: expected ID, or "" if expect failure
1357         "Latin-Hangul", REVERSE, "Hangul-Latin", // JB#912
1358 
1359         // JB#2689: bad compound causes crash
1360         "InvalidSource-InvalidTarget", FORWARD, "",
1361         "InvalidSource-InvalidTarget", REVERSE, "",
1362         "Hex-Any;InvalidSource-InvalidTarget", FORWARD, "",
1363         "Hex-Any;InvalidSource-InvalidTarget", REVERSE, "",
1364         "InvalidSource-InvalidTarget;Hex-Any", FORWARD, "",
1365         "InvalidSource-InvalidTarget;Hex-Any", REVERSE, "",
1366 
1367         NULL
1368     };
1369 
1370     for (int32_t i=0; DATA[i]; i+=3) {
1371         UParseError err;
1372         UErrorCode ec = U_ZERO_ERROR;
1373         UnicodeString id(DATA[i]);
1374         UTransDirection dir = (DATA[i+1]==FORWARD)?
1375             UTRANS_FORWARD:UTRANS_REVERSE;
1376         UnicodeString expID(DATA[i+2]);
1377         Transliterator* t =
1378             Transliterator::createInstance(id,dir,err,ec);
1379         UnicodeString newID;
1380         if (t) {
1381             newID = t->getID();
1382         }
1383         UBool ok = (newID == expID);
1384         if (!t) {
1385             newID = u_errorName(ec);
1386         }
1387         if (ok) {
1388             logln((UnicodeString)"Ok: createInstance(" +
1389                   id + "," + DATA[i+1] + ") => " + newID);
1390         } else {
1391             dataerrln((UnicodeString)"FAIL: createInstance(" +
1392                   id + "," + DATA[i+1] + ") => " + newID +
1393                   ", expected " + expID);
1394         }
1395         delete t;
1396     }
1397 }
1398 
1399 /**
1400  * Test the normalization transliterator.
1401  */
TestNormalizationTransliterator()1402 void TransliteratorTest::TestNormalizationTransliterator() {
1403     // THE FOLLOWING TWO TABLES ARE COPIED FROM com.ibm.test.normalizer.BasicTest
1404     // PLEASE KEEP THEM IN SYNC WITH BasicTest.
1405     const char* CANON[] = {
1406         // Input               Decomposed            Composed
1407         "cat",                "cat",                "cat"               ,
1408         "\\u00e0ardvark",      "a\\u0300ardvark",     "\\u00e0ardvark"    ,
1409 
1410         "\\u1e0a",             "D\\u0307",            "\\u1e0a"            , // D-dot_above
1411         "D\\u0307",            "D\\u0307",            "\\u1e0a"            , // D dot_above
1412 
1413         "\\u1e0c\\u0307",       "D\\u0323\\u0307",      "\\u1e0c\\u0307"      , // D-dot_below dot_above
1414         "\\u1e0a\\u0323",       "D\\u0323\\u0307",      "\\u1e0c\\u0307"      , // D-dot_above dot_below
1415         "D\\u0307\\u0323",      "D\\u0323\\u0307",      "\\u1e0c\\u0307"      , // D dot_below dot_above
1416 
1417         "\\u1e10\\u0307\\u0323", "D\\u0327\\u0323\\u0307","\\u1e10\\u0323\\u0307", // D dot_below cedilla dot_above
1418         "D\\u0307\\u0328\\u0323","D\\u0328\\u0323\\u0307","\\u1e0c\\u0328\\u0307", // D dot_above ogonek dot_below
1419 
1420         "\\u1E14",             "E\\u0304\\u0300",      "\\u1E14"            , // E-macron-grave
1421         "\\u0112\\u0300",       "E\\u0304\\u0300",      "\\u1E14"            , // E-macron + grave
1422         "\\u00c8\\u0304",       "E\\u0300\\u0304",      "\\u00c8\\u0304"      , // E-grave + macron
1423 
1424         "\\u212b",             "A\\u030a",            "\\u00c5"            , // angstrom_sign
1425         "\\u00c5",             "A\\u030a",            "\\u00c5"            , // A-ring
1426 
1427         "\\u00fdffin",         "y\\u0301ffin",        "\\u00fdffin"        ,    //updated with 3.0
1428         "\\u00fd\\uFB03n",      "y\\u0301\\uFB03n",     "\\u00fd\\uFB03n"     , //updated with 3.0
1429 
1430         "Henry IV",           "Henry IV",           "Henry IV"          ,
1431         "Henry \\u2163",       "Henry \\u2163",       "Henry \\u2163"      ,
1432 
1433         "\\u30AC",             "\\u30AB\\u3099",       "\\u30AC"            , // ga (Katakana)
1434         "\\u30AB\\u3099",       "\\u30AB\\u3099",       "\\u30AC"            , // ka + ten
1435         "\\uFF76\\uFF9E",       "\\uFF76\\uFF9E",       "\\uFF76\\uFF9E"      , // hw_ka + hw_ten
1436         "\\u30AB\\uFF9E",       "\\u30AB\\uFF9E",       "\\u30AB\\uFF9E"      , // ka + hw_ten
1437         "\\uFF76\\u3099",       "\\uFF76\\u3099",       "\\uFF76\\u3099"      , // hw_ka + ten
1438 
1439         "A\\u0300\\u0316",      "A\\u0316\\u0300",      "\\u00C0\\u0316"      ,
1440         0 // end
1441     };
1442 
1443     const char* COMPAT[] = {
1444         // Input               Decomposed            Composed
1445         "\\uFB4f",             "\\u05D0\\u05DC",       "\\u05D0\\u05DC"     , // Alef-Lamed vs. Alef, Lamed
1446 
1447         "\\u00fdffin",         "y\\u0301ffin",        "\\u00fdffin"        ,    //updated for 3.0
1448         "\\u00fd\\uFB03n",      "y\\u0301ffin",        "\\u00fdffin"        , // ffi ligature -> f + f + i
1449 
1450         "Henry IV",           "Henry IV",           "Henry IV"          ,
1451         "Henry \\u2163",       "Henry IV",           "Henry IV"          ,
1452 
1453         "\\u30AC",             "\\u30AB\\u3099",       "\\u30AC"            , // ga (Katakana)
1454         "\\u30AB\\u3099",       "\\u30AB\\u3099",       "\\u30AC"            , // ka + ten
1455 
1456         "\\uFF76\\u3099",       "\\u30AB\\u3099",       "\\u30AC"            , // hw_ka + ten
1457         0 // end
1458     };
1459 
1460     int32_t i;
1461     UParseError parseError;
1462     UErrorCode status = U_ZERO_ERROR;
1463     Transliterator* NFD = Transliterator::createInstance("NFD", UTRANS_FORWARD, parseError, status);
1464     Transliterator* NFC = Transliterator::createInstance("NFC", UTRANS_FORWARD, parseError, status);
1465     if (!NFD || !NFC) {
1466         dataerrln("FAIL: createInstance failed: %s", u_errorName(status));
1467         delete NFD;
1468         delete NFC;
1469         return;
1470     }
1471     for (i=0; CANON[i]; i+=3) {
1472         UnicodeString in = CharsToUnicodeString(CANON[i]);
1473         UnicodeString expd = CharsToUnicodeString(CANON[i+1]);
1474         UnicodeString expc = CharsToUnicodeString(CANON[i+2]);
1475         expect(*NFD, in, expd);
1476         expect(*NFC, in, expc);
1477     }
1478     delete NFD;
1479     delete NFC;
1480 
1481     Transliterator* NFKD = Transliterator::createInstance("NFKD", UTRANS_FORWARD, parseError, status);
1482     Transliterator* NFKC = Transliterator::createInstance("NFKC", UTRANS_FORWARD, parseError, status);
1483     if (!NFKD || !NFKC) {
1484         dataerrln("FAIL: createInstance failed");
1485         delete NFKD;
1486         delete NFKC;
1487         return;
1488     }
1489     for (i=0; COMPAT[i]; i+=3) {
1490         UnicodeString in = CharsToUnicodeString(COMPAT[i]);
1491         UnicodeString expkd = CharsToUnicodeString(COMPAT[i+1]);
1492         UnicodeString expkc = CharsToUnicodeString(COMPAT[i+2]);
1493         expect(*NFKD, in, expkd);
1494         expect(*NFKC, in, expkc);
1495     }
1496     delete NFKD;
1497     delete NFKC;
1498 
1499     UParseError pe;
1500     status = U_ZERO_ERROR;
1501     Transliterator *t = Transliterator::createInstance("NFD; [x]Remove",
1502                                                        UTRANS_FORWARD,
1503                                                        pe, status);
1504     if (t == 0) {
1505         errln("FAIL: createInstance failed");
1506     }
1507     expect(*t, CharsToUnicodeString("\\u010dx"),
1508            CharsToUnicodeString("c\\u030C"));
1509     delete t;
1510 }
1511 
1512 /**
1513  * Test we can create basic transliterator even without data.
1514  */
TestBasicTransliteratorEvenWithoutData()1515 void TransliteratorTest::TestBasicTransliteratorEvenWithoutData() {
1516     const char16_t* TEST_DATA = u"\u0124e\u0301 \uFB01nd x";
1517     const char16_t* EXPECTED_RESULTS[] = {
1518         u"H\u0302e\u0301 \uFB01nd x",  // NFD
1519         u"\u0124\u00E9 \uFB01nd x",  // NFC
1520         u"H\u0302e\u0301 find x",  // NFKD
1521         u"\u0124\u00E9 find x",  // NFKC
1522         u"\u0124e\u0301 \uFB01nd x",  // Hex-Any
1523         u"\u0125e\u0301 \uFB01nd x",  // Lower
1524         u"\u0124e\uFB01ndx",  // [:^L:]Remove
1525         u"H\u0302e\u0301 \uFB01nd ",  // NFD; [x]Remove
1526         u"h\u0302e\u0301 find x",  // Lower; NFKD;
1527         u"hefindx",  // Lower; NFKD; [:^L:]Remove; NFC;
1528         u"\u0124e \uFB01nd x",  // [:Nonspacing Mark:] Remove;
1529         u"He \uFB01nd x",  // NFD; [:Nonspacing Mark:] Remove; NFC;
1530         // end
1531         0
1532     };
1533 
1534     const char* BASIC_TRANSLITERATOR_ID[] = {
1535         "NFD",
1536         "NFC",
1537         "NFKD",
1538         "NFKC",
1539         "Hex-Any",
1540         "Lower",
1541         "[:^L:]Remove",
1542         "NFD; [x]Remove",
1543         "Lower; NFKD;",
1544         "Lower; NFKD; [:^L:]Remove; NFC;",
1545         "[:Nonspacing Mark:] Remove;",
1546         "NFD; [:Nonspacing Mark:] Remove; NFC;",
1547         // end
1548         0
1549     };
1550     const char* BASIC_TRANSLITERATOR_RULES[] = {
1551         "::Lower; ::NFKD;",
1552         "::Lower; ::NFKD; ::[:^L:]Remove; ::NFC;",
1553         "::[:Nonspacing Mark:] Remove;",
1554         "::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;",
1555         // end
1556         0
1557     };
1558     for (int32_t i=0; BASIC_TRANSLITERATOR_ID[i]; i++) {
1559         UErrorCode status = U_ZERO_ERROR;
1560         UParseError parseError;
1561         std::unique_ptr<Transliterator> translit(Transliterator::createInstance(
1562             BASIC_TRANSLITERATOR_ID[i], UTRANS_FORWARD, parseError, status));
1563         if (translit.get() == nullptr || !U_SUCCESS(status)) {
1564             dataerrln("FAIL: createInstance %s failed", BASIC_TRANSLITERATOR_ID[i]);
1565             continue;
1566         }
1567         UnicodeString data(TEST_DATA);
1568         UnicodeString expected(EXPECTED_RESULTS[i]);
1569         translit->transliterate(data);
1570         if (data != expected) {
1571             dataerrln(UnicodeString("FAIL: expected translit(") +
1572                       BASIC_TRANSLITERATOR_ID[i] + ") = '" +
1573                       EXPECTED_RESULTS[i] + "' but got '" + data);
1574             continue;
1575         }
1576     }
1577     for (int32_t i=0; BASIC_TRANSLITERATOR_RULES[i]; i++) {
1578         UErrorCode status = U_ZERO_ERROR;
1579         UParseError parseError;
1580         std::unique_ptr<Transliterator> translit(Transliterator::createFromRules(
1581             "Test",
1582             BASIC_TRANSLITERATOR_RULES[i], UTRANS_FORWARD, parseError, status));
1583         if (translit.get() == nullptr || !U_SUCCESS(status)) {
1584             dataerrln("FAIL: createFromRules %s failed", BASIC_TRANSLITERATOR_RULES[i]);
1585             continue;
1586         }
1587     }
1588 }
1589 
1590 /**
1591  * Test compound RBT rules.
1592  */
TestCompoundRBT(void)1593 void TransliteratorTest::TestCompoundRBT(void) {
1594     // Careful with spacing and ';' here:  Phrase this exactly
1595     // as toRules() is going to return it.  If toRules() changes
1596     // with regard to spacing or ';', then adjust this string.
1597     UnicodeString rule("::Hex-Any;\n"
1598                        "::Any-Lower;\n"
1599                        "a > '.A.';\n"
1600                        "b > '.B.';\n"
1601                        "::[^t]Any-Upper;", "");
1602     UParseError parseError;
1603     UErrorCode status = U_ZERO_ERROR;
1604     Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, parseError, status);
1605     if (t == 0) {
1606         errln("FAIL: createFromRules failed");
1607         return;
1608     }
1609     expect(*t, UNICODE_STRING_SIMPLE("\\u0043at in the hat, bat on the mat"),
1610            "C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
1611     UnicodeString r;
1612     t->toRules(r, true);
1613     if (r == rule) {
1614         logln((UnicodeString)"OK: toRules() => " + r);
1615     } else {
1616         errln((UnicodeString)"FAIL: toRules() => " + r +
1617               ", expected " + rule);
1618     }
1619     delete t;
1620 
1621     // Now test toRules
1622     t = Transliterator::createInstance("Greek-Latin; Latin-Cyrillic", UTRANS_FORWARD, parseError, status);
1623     if (t == 0) {
1624         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1625         return;
1626     }
1627     UnicodeString exp("::Greek-Latin;\n::Latin-Cyrillic;");
1628     t->toRules(r, true);
1629     if (r != exp) {
1630         errln((UnicodeString)"FAIL: toRules() => " + r +
1631               ", expected " + exp);
1632     } else {
1633         logln((UnicodeString)"OK: toRules() => " + r);
1634     }
1635     delete t;
1636 
1637     // Round trip the result of toRules
1638     t = Transliterator::createFromRules("Test", r, UTRANS_FORWARD, parseError, status);
1639     if (t == 0) {
1640         errln("FAIL: createFromRules #2 failed");
1641         return;
1642     } else {
1643         logln((UnicodeString)"OK: createFromRules(" + r + ") succeeded");
1644     }
1645 
1646     // Test toRules again
1647     t->toRules(r, true);
1648     if (r != exp) {
1649         errln((UnicodeString)"FAIL: toRules() => " + r +
1650               ", expected " + exp);
1651     } else {
1652         logln((UnicodeString)"OK: toRules() => " + r);
1653     }
1654 
1655     delete t;
1656 
1657     // Test Foo(Bar) IDs.  Careful with spacing in id; make it conform
1658     // to what the regenerated ID will look like.
1659     UnicodeString id("Upper(Lower);(NFKC)", "");
1660     t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
1661     if (t == 0) {
1662         errln("FAIL: createInstance #2 failed");
1663         return;
1664     }
1665     if (t->getID() == id) {
1666         logln((UnicodeString)"OK: created " + id);
1667     } else {
1668         errln((UnicodeString)"FAIL: createInstance(" + id +
1669               ").getID() => " + t->getID());
1670     }
1671 
1672     Transliterator *u = t->createInverse(status);
1673     if (u == 0) {
1674         errln("FAIL: createInverse failed");
1675         delete t;
1676         return;
1677     }
1678     exp = "NFKC();Lower(Upper)";
1679     if (u->getID() == exp) {
1680         logln((UnicodeString)"OK: createInverse(" + id + ") => " +
1681               u->getID());
1682     } else {
1683         errln((UnicodeString)"FAIL: createInverse(" + id + ") => " +
1684               u->getID());
1685     }
1686     delete t;
1687     delete u;
1688 }
1689 
1690 /**
1691  * Compound filter semantics were originally not implemented
1692  * correctly.  Originally, each component filter f(i) is replaced by
1693  * f'(i) = f(i) && g, where g is the filter for the compound
1694  * transliterator.
1695  *
1696  * From Mark:
1697  *
1698  * Suppose and I have a transliterator X. Internally X is
1699  * "Greek-Latin; Latin-Cyrillic; Any-Lower". I use a filter [^A].
1700  *
1701  * The compound should convert all greek characters (through latin) to
1702  * cyrillic, then lowercase the result. The filter should say "don't
1703  * touch 'A' in the original". But because an intermediate result
1704  * happens to go through "A", the Greek Alpha gets hung up.
1705  */
TestCompoundFilter(void)1706 void TransliteratorTest::TestCompoundFilter(void) {
1707     UParseError parseError;
1708     UErrorCode status = U_ZERO_ERROR;
1709     Transliterator *t = Transliterator::createInstance
1710         ("Greek-Latin; Latin-Greek; Lower", UTRANS_FORWARD, parseError, status);
1711     if (t == 0) {
1712         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1713         return;
1714     }
1715     t->adoptFilter(new UnicodeSet("[^A]", status));
1716     if (U_FAILURE(status)) {
1717         errln("FAIL: UnicodeSet ct failed");
1718         delete t;
1719         return;
1720     }
1721 
1722     // Only the 'A' at index 1 should remain unchanged
1723     expect(*t,
1724            CharsToUnicodeString("BA\\u039A\\u0391"),
1725            CharsToUnicodeString("\\u03b2A\\u03ba\\u03b1"));
1726     delete t;
1727 }
1728 
TestRemove(void)1729 void TransliteratorTest::TestRemove(void) {
1730     UParseError parseError;
1731     UErrorCode status = U_ZERO_ERROR;
1732     Transliterator *t = Transliterator::createInstance("Remove[abc]", UTRANS_FORWARD, parseError, status);
1733     if (t == 0) {
1734         errln("FAIL: createInstance failed");
1735         return;
1736     }
1737 
1738     expect(*t, "Able bodied baker's cats", "Ale odied ker's ts");
1739 
1740     // extra test for RemoveTransliterator::clone(), which at one point wasn't
1741     // duplicating the filter
1742     Transliterator* t2 = t->clone();
1743     expect(*t2, "Able bodied baker's cats", "Ale odied ker's ts");
1744 
1745     delete t;
1746     delete t2;
1747 }
1748 
TestToRules(void)1749 void TransliteratorTest::TestToRules(void) {
1750     const char* RBT = "rbt";
1751     const char* SET = "set";
1752     static const char* DATA[] = {
1753         RBT,
1754         "$a=\\u4E61; [$a] > A;",
1755         "[\\u4E61] > A;",
1756 
1757         RBT,
1758         "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
1759         "[[:Zs:][:Zl:]]{a} > A;",
1760 
1761         SET,
1762         "[[:Zs:][:Zl:]]",
1763         "[[:Zs:][:Zl:]]",
1764 
1765         SET,
1766         "[:Ps:]",
1767         "[:Ps:]",
1768 
1769         SET,
1770         "[:L:]",
1771         "[:L:]",
1772 
1773         SET,
1774         "[[:L:]-[A]]",
1775         "[[:L:]-[A]]",
1776 
1777         SET,
1778         "[~[:Lu:][:Ll:]]",
1779         "[~[:Lu:][:Ll:]]",
1780 
1781         SET,
1782         "[~[a-z]]",
1783         "[~[a-z]]",
1784 
1785         RBT,
1786         "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
1787         "[^[:Zs:]]{a} > A;",
1788 
1789         RBT,
1790         "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
1791         "[[a-z]-[:Zs:]]{a} > A;",
1792 
1793         RBT,
1794         "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
1795         "[[:Zs:]&[a-z]]{a} > A;",
1796 
1797         RBT,
1798         "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
1799         "[x[:Zs:]]{a} > A;",
1800 
1801         RBT,
1802         "$accentMinus = [ [\\u0300-\\u0345] & [:M:] - [\\u0338]] ;"
1803         "$macron = \\u0304 ;"
1804         "$evowel = [aeiouyAEIOUY] ;"
1805         "$iotasub = \\u0345 ;"
1806         "($evowel $macron $accentMinus *) i > | $1 $iotasub ;",
1807         "([AEIOUYaeiouy]\\u0304[[\\u0300-\\u0345]&[:M:]-[\\u0338]]*)i > | $1 \\u0345;",
1808 
1809         RBT,
1810         "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1811         "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1812     };
1813     static const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1814 
1815     for (int32_t d=0; d < DATA_length; d+=3) {
1816         if (DATA[d] == RBT) {
1817             // Transliterator test
1818             UParseError parseError;
1819             UErrorCode status = U_ZERO_ERROR;
1820             Transliterator *t = Transliterator::createFromRules("ID",
1821                                                                 UnicodeString(DATA[d+1], -1, US_INV), UTRANS_FORWARD, parseError, status);
1822             if (t == 0) {
1823                 dataerrln("FAIL: createFromRules failed - %s", u_errorName(status));
1824                 return;
1825             }
1826             UnicodeString rules, escapedRules;
1827             t->toRules(rules, false);
1828             t->toRules(escapedRules, true);
1829             UnicodeString expRules = CharsToUnicodeString(DATA[d+2]);
1830             UnicodeString expEscapedRules(DATA[d+2], -1, US_INV);
1831             if (rules == expRules) {
1832                 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1833                       " => " + rules);
1834             } else {
1835                 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1836                       " => " + rules + ", exp " + expRules);
1837             }
1838             if (escapedRules == expEscapedRules) {
1839                 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1840                       " => " + escapedRules);
1841             } else {
1842                 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1843                       " => " + escapedRules + ", exp " + expEscapedRules);
1844             }
1845             delete t;
1846 
1847         } else {
1848             // UnicodeSet test
1849             UErrorCode status = U_ZERO_ERROR;
1850             UnicodeString pat(DATA[d+1], -1, US_INV);
1851             UnicodeString expToPat(DATA[d+2], -1, US_INV);
1852             UnicodeSet set(pat, status);
1853             if (U_FAILURE(status)) {
1854                 errln("FAIL: UnicodeSet ct failed");
1855                 return;
1856             }
1857             // Adjust spacing etc. as necessary.
1858             UnicodeString toPat;
1859             set.toPattern(toPat);
1860             if (expToPat == toPat) {
1861                 logln((UnicodeString)"Ok: " + pat +
1862                       " => " + toPat);
1863             } else {
1864                 errln((UnicodeString)"FAIL: " + pat +
1865                       " => " + prettify(toPat, true) +
1866                       ", exp " + prettify(pat, true));
1867             }
1868         }
1869     }
1870 }
1871 
TestContext()1872 void TransliteratorTest::TestContext() {
1873     UTransPosition pos = {0, 2, 0, 1}; // cs cl s l
1874     expect("de > x; {d}e > y;",
1875            "de",
1876            "ye",
1877            &pos);
1878 
1879     expect("ab{c} > z;",
1880            "xadabdabcy",
1881            "xadabdabzy");
1882 }
1883 
TestSupplemental()1884 void TransliteratorTest::TestSupplemental() {
1885 
1886     expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];"
1887                                 "a > $a; $s > i;"),
1888            CharsToUnicodeString("ab\\U0001030Fx"),
1889            CharsToUnicodeString("\\U00010300bix"));
1890 
1891     expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];"
1892                                 "$b=[A-Z\\U00010400-\\U0001044D];"
1893                                 "($a)($b) > $2 $1;"),
1894            CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
1895            CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
1896 
1897     // k|ax\\U00010300xm
1898 
1899     // k|a\\U00010400\\U00010300xm
1900     // ky|\\U00010400\\U00010300xm
1901     // ky\\U00010400|\\U00010300xm
1902 
1903     // ky\\U00010400|\\U00010300\\U00010400m
1904     // ky\\U00010400y|\\U00010400m
1905     expect(CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];"
1906                                 "$a {x} > | @ \\U00010400;"
1907                                 "{$a} [^\\u0000-\\uFFFF] > y;"),
1908            CharsToUnicodeString("kax\\U00010300xm"),
1909            CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
1910 
1911     expectT("Any-Name",
1912            CharsToUnicodeString("\\U00010330\\U000E0061\\u00A0"),
1913            UNICODE_STRING_SIMPLE("\\N{GOTHIC LETTER AHSA}\\N{TAG LATIN SMALL LETTER A}\\N{NO-BREAK SPACE}"));
1914 
1915     expectT("Any-Hex/Unicode",
1916            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1917            UNICODE_STRING_SIMPLE("U+10330U+10FF00U+E0061U+00A0"));
1918 
1919     expectT("Any-Hex/C",
1920            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1921            UNICODE_STRING_SIMPLE("\\U00010330\\U0010FF00\\U000E0061\\u00A0"));
1922 
1923     expectT("Any-Hex/Perl",
1924            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1925            UNICODE_STRING_SIMPLE("\\x{10330}\\x{10FF00}\\x{E0061}\\x{A0}"));
1926 
1927     expectT("Any-Hex/Java",
1928            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1929            UNICODE_STRING_SIMPLE("\\uD800\\uDF30\\uDBFF\\uDF00\\uDB40\\uDC61\\u00A0"));
1930 
1931     expectT("Any-Hex/XML",
1932            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1933            "&#x10330;&#x10FF00;&#xE0061;&#xA0;");
1934 
1935     expectT("Any-Hex/XML10",
1936            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1937            "&#66352;&#1113856;&#917601;&#160;");
1938 
1939     expectT(UNICODE_STRING_SIMPLE("[\\U000E0000-\\U000E0FFF] Remove"),
1940            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1941            CharsToUnicodeString("\\U00010330\\U0010FF00\\u00A0"));
1942 }
1943 
TestQuantifier()1944 void TransliteratorTest::TestQuantifier() {
1945 
1946     // Make sure @ in a quantified anteContext works
1947     expect("a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
1948            "AAAAAb",
1949            "aaa(aac)");
1950 
1951     // Make sure @ in a quantified postContext works
1952     expect("{b} a+ > c @@ |; (a+) > '(' $1 ')';",
1953            "baaaaa",
1954            "caa(aaa)");
1955 
1956     // Make sure @ in a quantified postContext with seg ref works
1957     expect("{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
1958            "baaaaa",
1959            "baa(aaa)");
1960 
1961     // Make sure @ past ante context doesn't enter ante context
1962     UTransPosition pos = {0, 5, 3, 5};
1963     expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
1964            "xxxab",
1965            "xxx(ac)",
1966            &pos);
1967 
1968     // Make sure @ past post context doesn't pass limit
1969     UTransPosition pos2 = {0, 4, 0, 2};
1970     expect("{b} a+ > c @@ |; x > y; a > A;",
1971            "baxx",
1972            "caxx",
1973            &pos2);
1974 
1975     // Make sure @ past post context doesn't enter post context
1976     expect("{b} a+ > c @@ |; x > y; a > A;",
1977            "baxx",
1978            "cayy");
1979 
1980     expect("(ab)? c > d;",
1981            "c abc ababc",
1982            "d d abd");
1983 
1984     // NOTE: The (ab)+ when referenced just yields a single "ab",
1985     // not the full sequence of them.  This accords with perl behavior.
1986     expect("(ab)+ {x} > '(' $1 ')';",
1987            "x abx ababxy",
1988            "x ab(ab) abab(ab)y");
1989 
1990     expect("b+ > x;",
1991            "ac abc abbc abbbc",
1992            "ac axc axc axc");
1993 
1994     expect("[abc]+ > x;",
1995            "qac abrc abbcs abtbbc",
1996            "qx xrx xs xtx");
1997 
1998     expect("q{(ab)+} > x;",
1999            "qa qab qaba qababc qaba",
2000            "qa qx qxa qxc qxa");
2001 
2002     expect("q(ab)* > x;",
2003            "qa qab qaba qababc",
2004            "xa x xa xc");
2005 
2006     // NOTE: The (ab)+ when referenced just yields a single "ab",
2007     // not the full sequence of them.  This accords with perl behavior.
2008     expect("q(ab)* > '(' $1 ')';",
2009            "qa qab qaba qababc",
2010            "()a (ab) (ab)a (ab)c");
2011 
2012     // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
2013     // quoted string
2014     expect("'ab'+ > x;",
2015            "bb ab ababb",
2016            "bb x xb");
2017 
2018     // $foo+ and $foo* -- the quantifier should apply to the entire
2019     // variable reference
2020     expect("$var = ab; $var+ > x;",
2021            "bb ab ababb",
2022            "bb x xb");
2023 }
2024 
2025 class TestTrans : public Transliterator {
2026 public:
TestTrans(const UnicodeString & id)2027     TestTrans(const UnicodeString& id) : Transliterator(id, 0) {
2028     }
clone(void) const2029     virtual TestTrans* clone(void) const override {
2030         return new TestTrans(getID());
2031     }
handleTransliterate(Replaceable &,UTransPosition & offsets,UBool) const2032     virtual void handleTransliterate(Replaceable& /*text*/, UTransPosition& offsets,
2033         UBool /*isIncremental*/) const override
2034     {
2035         offsets.start = offsets.limit;
2036     }
2037     virtual UClassID getDynamicClassID() const override;
2038     static UClassID U_EXPORT2 getStaticClassID();
2039 };
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)2040 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)
2041 
2042 /**
2043  * Test Source-Target/Variant.
2044  */
2045 void TransliteratorTest::TestSTV(void) {
2046     int32_t ns = Transliterator::countAvailableSources();
2047     if (ns < 0 || ns > 255) {
2048         errln((UnicodeString)"FAIL: Bad source count: " + ns);
2049         return;
2050     }
2051     int32_t i, j;
2052     for (i=0; i<ns; ++i) {
2053         UnicodeString source;
2054         Transliterator::getAvailableSource(i, source);
2055         logln((UnicodeString)"" + i + ": " + source);
2056         if (source.length() == 0) {
2057             errln("FAIL: empty source");
2058             continue;
2059         }
2060         int32_t nt = Transliterator::countAvailableTargets(source);
2061         if (nt < 0 || nt > 255) {
2062             errln((UnicodeString)"FAIL: Bad target count: " + nt);
2063             continue;
2064         }
2065         for (int32_t j=0; j<nt; ++j) {
2066             UnicodeString target;
2067             Transliterator::getAvailableTarget(j, source, target);
2068             logln((UnicodeString)" " + j + ": " + target);
2069             if (target.length() == 0) {
2070                 errln("FAIL: empty target");
2071                 continue;
2072             }
2073             int32_t nv = Transliterator::countAvailableVariants(source, target);
2074             if (nv < 0 || nv > 255) {
2075                 errln((UnicodeString)"FAIL: Bad variant count: " + nv);
2076                 continue;
2077             }
2078             for (int32_t k=0; k<nv; ++k) {
2079                 UnicodeString variant;
2080                 Transliterator::getAvailableVariant(k, source, target, variant);
2081                 if (variant.length() == 0) {
2082                     logln((UnicodeString)"  " + k + ": <empty>");
2083                 } else {
2084                     logln((UnicodeString)"  " + k + ": " + variant);
2085                 }
2086             }
2087         }
2088     }
2089 
2090     // Test registration
2091     const char* IDS[] = { "Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2092     const char* FULL_IDS[] = { "Any-Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2093     const char* SOURCES[] = { NULL, "Seoridf", "Oewoir" };
2094     for (i=0; i<3; ++i) {
2095         Transliterator *t = new TestTrans(IDS[i]);
2096         if (t == 0) {
2097             errln("FAIL: out of memory");
2098             return;
2099         }
2100         if (t->getID() != IDS[i]) {
2101             errln((UnicodeString)"FAIL: ID mismatch for " + IDS[i]);
2102             delete t;
2103             return;
2104         }
2105         Transliterator::registerInstance(t);
2106         UErrorCode status = U_ZERO_ERROR;
2107         t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2108         if (t == NULL) {
2109             errln((UnicodeString)"FAIL: Registration/creation failed for ID " +
2110                   IDS[i]);
2111         } else {
2112             logln((UnicodeString)"Ok: Registration/creation succeeded for ID " +
2113                   IDS[i]);
2114             delete t;
2115         }
2116         Transliterator::unregister(IDS[i]);
2117         t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2118         if (t != NULL) {
2119             errln((UnicodeString)"FAIL: Unregistration failed for ID " +
2120                   IDS[i]);
2121             delete t;
2122         }
2123     }
2124 
2125     // Make sure getAvailable API reflects removal
2126     int32_t n = Transliterator::countAvailableIDs();
2127     for (i=0; i<n; ++i) {
2128         UnicodeString id = Transliterator::getAvailableID(i);
2129         for (j=0; j<3; ++j) {
2130             if (id.caseCompare(FULL_IDS[j],0)==0) {
2131                 errln((UnicodeString)"FAIL: unregister(" + id + ") failed");
2132             }
2133         }
2134     }
2135     n = Transliterator::countAvailableTargets("Any");
2136     for (i=0; i<n; ++i) {
2137         UnicodeString t;
2138         Transliterator::getAvailableTarget(i, "Any", t);
2139         if (t.caseCompare(IDS[0],0)==0) {
2140             errln((UnicodeString)"FAIL: unregister(Any-" + t + ") failed");
2141         }
2142     }
2143     n = Transliterator::countAvailableSources();
2144     for (i=0; i<n; ++i) {
2145         UnicodeString s;
2146         Transliterator::getAvailableSource(i, s);
2147         for (j=0; j<3; ++j) {
2148             if (SOURCES[j] == NULL) continue;
2149             if (s.caseCompare(SOURCES[j],0)==0) {
2150                 errln((UnicodeString)"FAIL: unregister(" + s + "-*) failed");
2151             }
2152         }
2153     }
2154 }
2155 
2156 /**
2157  * Test inverse of Greek-Latin; Title()
2158  */
TestCompoundInverse(void)2159 void TransliteratorTest::TestCompoundInverse(void) {
2160     UParseError parseError;
2161     UErrorCode status = U_ZERO_ERROR;
2162     Transliterator *t = Transliterator::createInstance
2163         ("Greek-Latin; Title()", UTRANS_REVERSE,parseError, status);
2164     if (t == 0) {
2165         dataerrln("FAIL: createInstance - %s", u_errorName(status));
2166         return;
2167     }
2168     UnicodeString exp("(Title);Latin-Greek");
2169     if (t->getID() == exp) {
2170         logln("Ok: inverse of \"Greek-Latin; Title()\" is \"" +
2171               t->getID());
2172     } else {
2173         errln("FAIL: inverse of \"Greek-Latin; Title()\" is \"" +
2174               t->getID() + "\", expected \"" + exp + "\"");
2175     }
2176     delete t;
2177 }
2178 
2179 /**
2180  * Test NFD chaining with RBT
2181  */
TestNFDChainRBT()2182 void TransliteratorTest::TestNFDChainRBT() {
2183     UParseError pe;
2184     UErrorCode ec = U_ZERO_ERROR;
2185     Transliterator* t = Transliterator::createFromRules(
2186                                "TEST", "::NFD; aa > Q; a > q;",
2187                                UTRANS_FORWARD, pe, ec);
2188     if (t == NULL || U_FAILURE(ec)) {
2189         dataerrln("FAIL: Transliterator::createFromRules failed with %s", u_errorName(ec));
2190         return;
2191     }
2192     expect(*t, "aa", "Q");
2193     delete t;
2194 
2195     // TEMPORARY TESTS -- BEING DEBUGGED
2196 //=-    UnicodeString s, s2;
2197 //=-    t = Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, pe, ec);
2198 //=-    s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2199 //=-    s2 = CharsToUnicodeString("\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D");
2200 //=-    expect(*t, s, s2);
2201 //=-    delete t;
2202 //=-
2203 //=-    t = Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2204 //=-    expect(*t, s2, s);
2205 //=-    delete t;
2206 //=-
2207 //=-    t = Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2208 //=-    s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2209 //=-    expect(*t, s, s);
2210 //=-    delete t;
2211 
2212 //    const char* source[] = {
2213 //        /*
2214 //        "\\u015Br\\u012Bmad",
2215 //        "bhagavadg\\u012Bt\\u0101",
2216 //        "adhy\\u0101ya",
2217 //        "arjuna",
2218 //        "vi\\u1E63\\u0101da",
2219 //        "y\\u014Dga",
2220 //        "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2221 //        "uv\\u0101cr\\u0325",
2222 //        */
2223 //        "rmk\\u1E63\\u0113t",
2224 //      //"dharmak\\u1E63\\u0113tr\\u0113",
2225 //        /*
2226 //        "kuruk\\u1E63\\u0113tr\\u0113",
2227 //        "samav\\u0113t\\u0101",
2228 //        "yuyutsava-\\u1E25",
2229 //        "m\\u0101mak\\u0101-\\u1E25",
2230 //     // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2231 //        "kimakurvata",
2232 //        "san\\u0304java",
2233 //        */
2234 //
2235 //        0
2236 //    };
2237 //    const char* expected[] = {
2238 //        /*
2239 //        "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2240 //        "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2241 //        "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2242 //        "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2243 //        "\\u0935\\u093f\\u0937\\u093e\\u0926",
2244 //        "\\u092f\\u094b\\u0917",
2245 //        "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2246 //        "\\u0909\\u0935\\u093E\\u091A\\u0943",
2247 //        */
2248 //        "\\u0927",
2249 //        //"\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2250 //        /*
2251 //        "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2252 //        "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2253 //        "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2254 //        "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2255 //    //  "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2256 //        "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2257 //        "\\u0938\\u0902\\u091c\\u0935",
2258 //        */
2259 //        0
2260 //    };
2261 //    UErrorCode status = U_ZERO_ERROR;
2262 //    UParseError parseError;
2263 //    UnicodeString message;
2264 //    Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2265 //    Transliterator* devToLatinToDev=Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2266 //    if(U_FAILURE(status)){
2267 //        errln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2268 //        errln("PreContext: " + prettify(parseError.preContext) + "PostContext: " + prettify( parseError.postContext) );
2269 //        delete latinToDevToLatin;
2270 //        delete devToLatinToDev;
2271 //        return;
2272 //    }
2273 //    UnicodeString gotResult;
2274 //    for(int i= 0; source[i] != 0; i++){
2275 //        gotResult = source[i];
2276 //        expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2277 //        expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2278 //    }
2279 //    delete latinToDevToLatin;
2280 //    delete devToLatinToDev;
2281 }
2282 
2283 /**
2284  * Inverse of "Null" should be "Null". (J21)
2285  */
TestNullInverse()2286 void TransliteratorTest::TestNullInverse() {
2287     UParseError pe;
2288     UErrorCode ec = U_ZERO_ERROR;
2289     Transliterator *t = Transliterator::createInstance("Null", UTRANS_FORWARD, pe, ec);
2290     if (t == 0 || U_FAILURE(ec)) {
2291         errln("FAIL: createInstance");
2292         return;
2293     }
2294     Transliterator *u = t->createInverse(ec);
2295     if (u == 0 || U_FAILURE(ec)) {
2296         errln("FAIL: createInverse");
2297         delete t;
2298         return;
2299     }
2300     if (u->getID() != "Null") {
2301         errln("FAIL: Inverse of Null should be Null");
2302     }
2303     delete t;
2304     delete u;
2305 }
2306 
2307 /**
2308  * Check ID of inverse of alias. (J22)
2309  */
TestAliasInverseID()2310 void TransliteratorTest::TestAliasInverseID() {
2311     UnicodeString ID("Latin-Hangul", ""); // This should be any alias ID with an inverse
2312     UParseError pe;
2313     UErrorCode ec = U_ZERO_ERROR;
2314     Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2315     if (t == 0 || U_FAILURE(ec)) {
2316         dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2317         return;
2318     }
2319     Transliterator *u = t->createInverse(ec);
2320     if (u == 0 || U_FAILURE(ec)) {
2321         errln("FAIL: createInverse");
2322         delete t;
2323         return;
2324     }
2325     UnicodeString exp = "Hangul-Latin";
2326     UnicodeString got = u->getID();
2327     if (got != exp) {
2328         errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2329               ", expected " + exp);
2330     }
2331     delete t;
2332     delete u;
2333 }
2334 
2335 /**
2336  * Test IDs of inverses of compound transliterators. (J20)
2337  */
TestCompoundInverseID()2338 void TransliteratorTest::TestCompoundInverseID() {
2339     UnicodeString ID = "Latin-Jamo;NFC(NFD)";
2340     UParseError pe;
2341     UErrorCode ec = U_ZERO_ERROR;
2342     Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2343     if (t == 0 || U_FAILURE(ec)) {
2344         dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2345         return;
2346     }
2347     Transliterator *u = t->createInverse(ec);
2348     if (u == 0 || U_FAILURE(ec)) {
2349         errln("FAIL: createInverse");
2350         delete t;
2351         return;
2352     }
2353     UnicodeString exp = "NFD(NFC);Jamo-Latin";
2354     UnicodeString got = u->getID();
2355     if (got != exp) {
2356         errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2357               ", expected " + exp);
2358     }
2359     delete t;
2360     delete u;
2361 }
2362 
2363 /**
2364  * Test undefined variable.
2365 
2366  */
TestUndefinedVariable()2367 void TransliteratorTest::TestUndefinedVariable() {
2368     UnicodeString rule = "$initial } a <> \\u1161;";
2369     UParseError pe;
2370     UErrorCode ec = U_ZERO_ERROR;
2371     Transliterator *t = Transliterator::createFromRules("<ID>", rule, UTRANS_FORWARD, pe, ec);
2372     delete t;
2373     if (U_FAILURE(ec)) {
2374         logln((UnicodeString)"OK: Got exception for " + rule + ", as expected: " +
2375               u_errorName(ec));
2376         return;
2377     }
2378     errln((UnicodeString)"Fail: bogus rule " + rule + " compiled with error " +
2379           u_errorName(ec));
2380 }
2381 
2382 /**
2383  * Test empty context.
2384  */
TestEmptyContext()2385 void TransliteratorTest::TestEmptyContext() {
2386     expect(" { a } > b;", "xay a ", "xby b ");
2387 }
2388 
2389 /**
2390 * Test compound filter ID syntax
2391 */
TestCompoundFilterID(void)2392 void TransliteratorTest::TestCompoundFilterID(void) {
2393     static const char* DATA[] = {
2394         // Col. 1 = ID or rule set (latter must start with #)
2395 
2396         // = columns > 1 are null if expect col. 1 to be illegal =
2397 
2398         // Col. 2 = direction, "F..." or "R..."
2399         // Col. 3 = source string
2400         // Col. 4 = exp result
2401 
2402         "[abc]; [abc]", NULL, NULL, NULL, // multiple filters
2403         "Latin-Greek; [abc];", NULL, NULL, NULL, // misplaced filter
2404         "[b]; Latin-Greek; Upper; ([xyz])", "F", "abc", "a\\u0392c",
2405         "[b]; (Lower); Latin-Greek; Upper(); ([\\u0392])", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2406         "#\n::[b]; ::Latin-Greek; ::Upper; ::([xyz]);", "F", "abc", "a\\u0392c",
2407         "#\n::[b]; ::(Lower); ::Latin-Greek; ::Upper(); ::([\\u0392]);", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2408         NULL,
2409     };
2410 
2411     for (int32_t i=0; DATA[i]; i+=4) {
2412         UnicodeString id = CharsToUnicodeString(DATA[i]);
2413         UTransDirection direction = (DATA[i+1] != NULL && DATA[i+1][0] == 'R') ?
2414             UTRANS_REVERSE : UTRANS_FORWARD;
2415         UnicodeString source;
2416         UnicodeString exp;
2417         if (DATA[i+2] != NULL) {
2418             source = CharsToUnicodeString(DATA[i+2]);
2419             exp = CharsToUnicodeString(DATA[i+3]);
2420         }
2421         UBool expOk = (DATA[i+1] != NULL);
2422         LocalPointer<Transliterator> t;
2423         UParseError pe;
2424         UErrorCode ec = U_ZERO_ERROR;
2425         if (id.charAt(0) == 0x23/*#*/) {
2426             t.adoptInstead(Transliterator::createFromRules("ID", id, direction, pe, ec));
2427         } else {
2428             t.adoptInstead(Transliterator::createInstance(id, direction, pe, ec));
2429         }
2430         UBool ok = (t.isValid() && U_SUCCESS(ec));
2431         UnicodeString transID;
2432         if (t.isValid()) {
2433             transID = t->getID();
2434         }
2435         else {
2436             transID = UnicodeString("NULL", "");
2437         }
2438         if (ok == expOk) {
2439             logln((UnicodeString)"Ok: " + id + " => " + transID + ", " +
2440                   u_errorName(ec));
2441             if (source.length() != 0) {
2442                 expect(*t, source, exp);
2443             }
2444         } else {
2445             dataerrln((UnicodeString)"FAIL: " + id + " => " + transID + ", " +
2446                   u_errorName(ec));
2447         }
2448     }
2449 }
2450 
2451 /**
2452  * Test new property set syntax
2453  */
TestPropertySet()2454 void TransliteratorTest::TestPropertySet() {
2455     expect(UNICODE_STRING_SIMPLE("a>A; \\p{Lu}>x; \\p{ANY}>y;"), "abcDEF", "Ayyxxx");
2456     expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
2457            "[ a stitch ]\n[ in time ]\r[ saves 9]");
2458 }
2459 
2460 /**
2461  * Test various failure points of the new 2.0 engine.
2462  */
TestNewEngine()2463 void TransliteratorTest::TestNewEngine() {
2464     UParseError pe;
2465     UErrorCode ec = U_ZERO_ERROR;
2466     Transliterator *t = Transliterator::createInstance("Latin-Hiragana", UTRANS_FORWARD, pe, ec);
2467     if (t == 0 || U_FAILURE(ec)) {
2468         dataerrln("FAIL: createInstance Latin-Hiragana - %s", u_errorName(ec));
2469         return;
2470     }
2471     // Katakana should be untouched
2472     expect(*t, CharsToUnicodeString("a\\u3042\\u30A2"),
2473            CharsToUnicodeString("\\u3042\\u3042\\u30A2"));
2474 
2475     delete t;
2476 
2477 #if 1
2478     // This test will only work if Transliterator.ROLLBACK is
2479     // true.  Otherwise, this test will fail, revealing a
2480     // limitation of global filters in incremental mode.
2481     Transliterator *a =
2482         Transliterator::createFromRules("a_to_A", "a > A;", UTRANS_FORWARD, pe, ec);
2483     Transliterator *A =
2484         Transliterator::createFromRules("A_to_b", "A > b;", UTRANS_FORWARD, pe, ec);
2485     if (U_FAILURE(ec)) {
2486         delete a;
2487         delete A;
2488         return;
2489     }
2490 
2491     Transliterator* array[3];
2492     array[0] = a;
2493     array[1] = Transliterator::createInstance("NFD", UTRANS_FORWARD, pe, ec);
2494     array[2] = A;
2495     if (U_FAILURE(ec)) {
2496         errln("FAIL: createInstance NFD");
2497         delete a;
2498         delete A;
2499         delete array[1];
2500         return;
2501     }
2502 
2503     t = new CompoundTransliterator(array, 3, new UnicodeSet("[:Ll:]", ec));
2504     if (U_FAILURE(ec)) {
2505         errln("FAIL: UnicodeSet constructor");
2506         delete a;
2507         delete A;
2508         delete array[1];
2509         delete t;
2510         return;
2511     }
2512 
2513     expect(*t, "aAaA", "bAbA");
2514 
2515     assertTrue("countElements", t->countElements() == 3);
2516     assertEquals("getElement(0)", t->getElement(0, ec).getID(), "a_to_A");
2517     assertEquals("getElement(1)", t->getElement(1, ec).getID(), "NFD");
2518     assertEquals("getElement(2)", t->getElement(2, ec).getID(), "A_to_b");
2519     assertSuccess("getElement", ec);
2520 
2521     delete a;
2522     delete A;
2523     delete array[1];
2524     delete t;
2525 #endif
2526 
2527     expect("$smooth = x; $macron = q; [:^L:] { ([aeiouyAEIOUY] $macron?) } [^aeiouyAEIOUY$smooth$macron] > | $1 $smooth ;",
2528            "a",
2529            "ax");
2530 
2531     UnicodeString gr = CharsToUnicodeString(
2532         "$ddot = \\u0308 ;"
2533         "$lcgvowel = [\\u03b1\\u03b5\\u03b7\\u03b9\\u03bf\\u03c5\\u03c9] ;"
2534         "$rough = \\u0314 ;"
2535         "($lcgvowel+ $ddot?) $rough > h | $1 ;"
2536         "\\u03b1 <> a ;"
2537         "$rough <> h ;");
2538 
2539     expect(gr, CharsToUnicodeString("\\u03B1\\u0314"), "ha");
2540 }
2541 
2542 /**
2543  * Test quantified segment behavior.  We want:
2544  * ([abc])+ > x $1 x; applied to "cba" produces "xax"
2545  */
TestQuantifiedSegment(void)2546 void TransliteratorTest::TestQuantifiedSegment(void) {
2547     // The normal case
2548     expect("([abc]+) > x $1 x;", "cba", "xcbax");
2549 
2550     // The tricky case; the quantifier is around the segment
2551     expect("([abc])+ > x $1 x;", "cba", "xax");
2552 
2553     // Tricky case in reverse direction
2554     expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
2555 
2556     // Check post-context segment
2557     expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
2558 
2559     // Test toRule/toPattern for non-quantified segment.
2560     // Careful with spacing here.
2561     UnicodeString r("([a-c]){q} > x $1 x;");
2562     UParseError pe;
2563     UErrorCode ec = U_ZERO_ERROR;
2564     Transliterator* t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2565     if (U_FAILURE(ec)) {
2566         errln("FAIL: createFromRules");
2567         delete t;
2568         return;
2569     }
2570     UnicodeString rr;
2571     t->toRules(rr, true);
2572     if (r != rr) {
2573         errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2574     } else {
2575         logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2576     }
2577     delete t;
2578 
2579     // Test toRule/toPattern for quantified segment.
2580     // Careful with spacing here.
2581     r = "([a-c])+{q} > x $1 x;";
2582     t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2583     if (U_FAILURE(ec)) {
2584         errln("FAIL: createFromRules");
2585         delete t;
2586         return;
2587     }
2588     t->toRules(rr, true);
2589     if (r != rr) {
2590         errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2591     } else {
2592         logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2593     }
2594     delete t;
2595 }
2596 
2597 //======================================================================
2598 // Ram's tests
2599 //======================================================================
TestDevanagariLatinRT()2600 void TransliteratorTest::TestDevanagariLatinRT(){
2601     const int MAX_LEN= 52;
2602     const char* const source[MAX_LEN] = {
2603         "bh\\u0101rata",
2604         "kra",
2605         "k\\u1E63a",
2606         "khra",
2607         "gra",
2608         "\\u1E45ra",
2609         "cra",
2610         "chra",
2611         "j\\u00F1a",
2612         "jhra",
2613         "\\u00F1ra",
2614         "\\u1E6Dya",
2615         "\\u1E6Dhra",
2616         "\\u1E0Dya",
2617       //"r\\u0323ya", // \u095c is not valid in Devanagari
2618         "\\u1E0Dhya",
2619         "\\u1E5Bhra",
2620         "\\u1E47ra",
2621         "tta",
2622         "thra",
2623         "dda",
2624         "dhra",
2625         "nna",
2626         "pra",
2627         "phra",
2628         "bra",
2629         "bhra",
2630         "mra",
2631         "\\u1E49ra",
2632       //"l\\u0331ra",
2633         "yra",
2634         "\\u1E8Fra",
2635       //"l-",
2636         "vra",
2637         "\\u015Bra",
2638         "\\u1E63ra",
2639         "sra",
2640         "hma",
2641         "\\u1E6D\\u1E6Da",
2642         "\\u1E6D\\u1E6Dha",
2643         "\\u1E6Dh\\u1E6Dha",
2644         "\\u1E0D\\u1E0Da",
2645         "\\u1E0D\\u1E0Dha",
2646         "\\u1E6Dya",
2647         "\\u1E6Dhya",
2648         "\\u1E0Dya",
2649         "\\u1E0Dhya",
2650         // Not roundtrippable --
2651         // \\u0939\\u094d\\u094d\\u092E  - hma
2652         // \\u0939\\u094d\\u092E         - hma
2653         // CharsToUnicodeString("hma"),
2654         "hya",
2655         "\\u015Br\\u0325",
2656         "\\u015Bca",
2657         "\\u0115",
2658         "san\\u0304j\\u012Bb s\\u0113nagupta",
2659         "\\u0101nand vaddir\\u0101ju",
2660         "\\u0101",
2661         "a"
2662     };
2663     const char* const expected[MAX_LEN] = {
2664         "\\u092D\\u093E\\u0930\\u0924",   /* bha\\u0304rata */
2665         "\\u0915\\u094D\\u0930",          /* kra         */
2666         "\\u0915\\u094D\\u0937",          /* ks\\u0323a  */
2667         "\\u0916\\u094D\\u0930",          /* khra        */
2668         "\\u0917\\u094D\\u0930",          /* gra         */
2669         "\\u0919\\u094D\\u0930",          /* n\\u0307ra  */
2670         "\\u091A\\u094D\\u0930",          /* cra         */
2671         "\\u091B\\u094D\\u0930",          /* chra        */
2672         "\\u091C\\u094D\\u091E",          /* jn\\u0303a  */
2673         "\\u091D\\u094D\\u0930",          /* jhra        */
2674         "\\u091E\\u094D\\u0930",          /* n\\u0303ra  */
2675         "\\u091F\\u094D\\u092F",          /* t\\u0323ya  */
2676         "\\u0920\\u094D\\u0930",          /* t\\u0323hra */
2677         "\\u0921\\u094D\\u092F",          /* d\\u0323ya  */
2678       //"\\u095C\\u094D\\u092F",        /* r\\u0323ya  */ // \u095c is not valid in Devanagari
2679         "\\u0922\\u094D\\u092F",          /* d\\u0323hya */
2680         "\\u0922\\u093C\\u094D\\u0930",   /* r\\u0323hra */
2681         "\\u0923\\u094D\\u0930",          /* n\\u0323ra  */
2682         "\\u0924\\u094D\\u0924",          /* tta         */
2683         "\\u0925\\u094D\\u0930",          /* thra        */
2684         "\\u0926\\u094D\\u0926",          /* dda         */
2685         "\\u0927\\u094D\\u0930",          /* dhra        */
2686         "\\u0928\\u094D\\u0928",          /* nna         */
2687         "\\u092A\\u094D\\u0930",          /* pra         */
2688         "\\u092B\\u094D\\u0930",          /* phra        */
2689         "\\u092C\\u094D\\u0930",          /* bra         */
2690         "\\u092D\\u094D\\u0930",          /* bhra        */
2691         "\\u092E\\u094D\\u0930",          /* mra         */
2692         "\\u0929\\u094D\\u0930",          /* n\\u0331ra  */
2693       //"\\u0934\\u094D\\u0930",        /* l\\u0331ra  */
2694         "\\u092F\\u094D\\u0930",          /* yra         */
2695         "\\u092F\\u093C\\u094D\\u0930",   /* y\\u0307ra  */
2696       //"l-",
2697         "\\u0935\\u094D\\u0930",          /* vra         */
2698         "\\u0936\\u094D\\u0930",          /* s\\u0301ra  */
2699         "\\u0937\\u094D\\u0930",          /* s\\u0323ra  */
2700         "\\u0938\\u094D\\u0930",          /* sra         */
2701         "\\u0939\\u094d\\u092E",          /* hma         */
2702         "\\u091F\\u094D\\u091F",          /* t\\u0323t\\u0323a  */
2703         "\\u091F\\u094D\\u0920",          /* t\\u0323t\\u0323ha */
2704         "\\u0920\\u094D\\u0920",          /* t\\u0323ht\\u0323ha*/
2705         "\\u0921\\u094D\\u0921",          /* d\\u0323d\\u0323a  */
2706         "\\u0921\\u094D\\u0922",          /* d\\u0323d\\u0323ha */
2707         "\\u091F\\u094D\\u092F",          /* t\\u0323ya  */
2708         "\\u0920\\u094D\\u092F",          /* t\\u0323hya */
2709         "\\u0921\\u094D\\u092F",          /* d\\u0323ya  */
2710         "\\u0922\\u094D\\u092F",          /* d\\u0323hya */
2711      // "hma",                         /* hma         */
2712         "\\u0939\\u094D\\u092F",          /* hya         */
2713         "\\u0936\\u0943",                 /* s\\u0301r\\u0325a  */
2714         "\\u0936\\u094D\\u091A",          /* s\\u0301ca  */
2715         "\\u090d",                        /* e\\u0306    */
2716         "\\u0938\\u0902\\u091C\\u0940\\u092C\\u094D \\u0938\\u0947\\u0928\\u0917\\u0941\\u092A\\u094D\\u0924",
2717         "\\u0906\\u0928\\u0902\\u0926\\u094D \\u0935\\u0926\\u094D\\u0926\\u093F\\u0930\\u093E\\u091C\\u0941",
2718         "\\u0906",
2719         "\\u0905",
2720     };
2721     UErrorCode status = U_ZERO_ERROR;
2722     UParseError parseError;
2723     UnicodeString message;
2724     Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2725     Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2726     if(U_FAILURE(status)){
2727         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2728         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2729         return;
2730     }
2731     UnicodeString gotResult;
2732     for(int i= 0; i<MAX_LEN; i++){
2733         gotResult = source[i];
2734         expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2735         expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2736     }
2737     delete latinToDev;
2738     delete devToLatin;
2739 }
2740 
TestTeluguLatinRT()2741 void TransliteratorTest::TestTeluguLatinRT(){
2742     const int MAX_LEN=10;
2743     const char* const source[MAX_LEN] = {
2744         "raghur\\u0101m vi\\u015Bvan\\u0101dha",                         /* Raghuram Viswanadha    */
2745         "\\u0101nand vaddir\\u0101ju",                                   /* Anand Vaddiraju        */
2746         "r\\u0101j\\u012Bv ka\\u015Barab\\u0101da",                      /* Rajeev Kasarabada      */
2747         "san\\u0304j\\u012Bv ka\\u015Barab\\u0101da",                    /* sanjeev kasarabada     */
2748         "san\\u0304j\\u012Bb sen'gupta",                                 /* sanjib sengupata       */
2749         "amar\\u0113ndra hanum\\u0101nula",                              /* Amarendra hanumanula   */
2750         "ravi kum\\u0101r vi\\u015Bvan\\u0101dha",                       /* Ravi Kumar Viswanadha  */
2751         "\\u0101ditya kandr\\u0113gula",                                 /* Aditya Kandregula      */
2752         "\\u015Br\\u012Bdhar ka\\u1E47\\u1E6Dama\\u015Be\\u1E6D\\u1E6Di",/* Shridhar Kantamsetty   */
2753         "m\\u0101dhav de\\u015Be\\u1E6D\\u1E6Di"                         /* Madhav Desetty         */
2754     };
2755 
2756     const char* const expected[MAX_LEN] = {
2757         "\\u0c30\\u0c18\\u0c41\\u0c30\\u0c3e\\u0c2e\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2758         "\\u0c06\\u0c28\\u0c02\\u0c26\\u0c4d \\u0C35\\u0C26\\u0C4D\\u0C26\\u0C3F\\u0C30\\u0C3E\\u0C1C\\u0C41",
2759         "\\u0c30\\u0c3e\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2760         "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2761         "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c2c\\u0c4d \\u0c38\\u0c46\\u0c28\\u0c4d\\u0c17\\u0c41\\u0c2a\\u0c4d\\u0c24",
2762         "\\u0c05\\u0c2e\\u0c30\\u0c47\\u0c02\\u0c26\\u0c4d\\u0c30 \\u0c39\\u0c28\\u0c41\\u0c2e\\u0c3e\\u0c28\\u0c41\\u0c32",
2763         "\\u0c30\\u0c35\\u0c3f \\u0c15\\u0c41\\u0c2e\\u0c3e\\u0c30\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2764         "\\u0c06\\u0c26\\u0c3f\\u0c24\\u0c4d\\u0c2f \\u0C15\\u0C02\\u0C26\\u0C4D\\u0C30\\u0C47\\u0C17\\u0C41\\u0c32",
2765         "\\u0c36\\u0c4d\\u0c30\\u0c40\\u0C27\\u0C30\\u0C4D \\u0c15\\u0c02\\u0c1f\\u0c2e\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2766         "\\u0c2e\\u0c3e\\u0c27\\u0c35\\u0c4d \\u0c26\\u0c46\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2767     };
2768 
2769     UErrorCode status = U_ZERO_ERROR;
2770     UParseError parseError;
2771     UnicodeString message;
2772     Transliterator* latinToDev=Transliterator::createInstance("Latin-Telugu", UTRANS_FORWARD, parseError, status);
2773     Transliterator* devToLatin=Transliterator::createInstance("Telugu-Latin", UTRANS_FORWARD, parseError, status);
2774     if(U_FAILURE(status)){
2775         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2776         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2777         return;
2778     }
2779     UnicodeString gotResult;
2780     for(int i= 0; i<MAX_LEN; i++){
2781         gotResult = source[i];
2782         expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2783         expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2784     }
2785     delete latinToDev;
2786     delete devToLatin;
2787 }
2788 
TestSanskritLatinRT()2789 void TransliteratorTest::TestSanskritLatinRT(){
2790     const int MAX_LEN =16;
2791     const char* const source[MAX_LEN] = {
2792         "rmk\\u1E63\\u0113t",
2793         "\\u015Br\\u012Bmad",
2794         "bhagavadg\\u012Bt\\u0101",
2795         "adhy\\u0101ya",
2796         "arjuna",
2797         "vi\\u1E63\\u0101da",
2798         "y\\u014Dga",
2799         "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2800         "uv\\u0101cr\\u0325",
2801         "dharmak\\u1E63\\u0113tr\\u0113",
2802         "kuruk\\u1E63\\u0113tr\\u0113",
2803         "samav\\u0113t\\u0101",
2804         "yuyutsava\\u1E25",
2805         "m\\u0101mak\\u0101\\u1E25",
2806     // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2807         "kimakurvata",
2808         "san\\u0304java",
2809     };
2810     const char* const expected[MAX_LEN] = {
2811         "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2812         "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2813         "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2814         "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2815         "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2816         "\\u0935\\u093f\\u0937\\u093e\\u0926",
2817         "\\u092f\\u094b\\u0917",
2818         "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2819         "\\u0909\\u0935\\u093E\\u091A\\u0943",
2820         "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2821         "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2822         "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2823         "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2824         "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2825     //"\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2826         "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2827         "\\u0938\\u0902\\u091c\\u0935",
2828     };
2829     UErrorCode status = U_ZERO_ERROR;
2830     UParseError parseError;
2831     UnicodeString message;
2832     Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2833     Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2834     if(U_FAILURE(status)){
2835         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2836         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2837         return;
2838     }
2839     UnicodeString gotResult;
2840     for(int i= 0; i<MAX_LEN; i++){
2841         gotResult = source[i];
2842         expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2843         expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2844     }
2845     delete latinToDev;
2846     delete devToLatin;
2847 }
2848 
2849 
TestCompoundLatinRT()2850 void TransliteratorTest::TestCompoundLatinRT(){
2851     const char* const source[] = {
2852         "rmk\\u1E63\\u0113t",
2853         "\\u015Br\\u012Bmad",
2854         "bhagavadg\\u012Bt\\u0101",
2855         "adhy\\u0101ya",
2856         "arjuna",
2857         "vi\\u1E63\\u0101da",
2858         "y\\u014Dga",
2859         "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2860         "uv\\u0101cr\\u0325",
2861         "dharmak\\u1E63\\u0113tr\\u0113",
2862         "kuruk\\u1E63\\u0113tr\\u0113",
2863         "samav\\u0113t\\u0101",
2864         "yuyutsava\\u1E25",
2865         "m\\u0101mak\\u0101\\u1E25",
2866      // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2867         "kimakurvata",
2868         "san\\u0304java"
2869     };
2870     const int MAX_LEN = UPRV_LENGTHOF(source);
2871     const char* const expected[MAX_LEN] = {
2872         "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2873         "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2874         "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2875         "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2876         "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2877         "\\u0935\\u093f\\u0937\\u093e\\u0926",
2878         "\\u092f\\u094b\\u0917",
2879         "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2880         "\\u0909\\u0935\\u093E\\u091A\\u0943",
2881         "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2882         "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2883         "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2884         "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2885         "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2886     //  "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2887         "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2888         "\\u0938\\u0902\\u091c\\u0935"
2889     };
2890     if(MAX_LEN != UPRV_LENGTHOF(expected)) {
2891         errln("error in TestCompoundLatinRT: source[] and expected[] have different lengths!");
2892         return;
2893     }
2894 
2895     UErrorCode status = U_ZERO_ERROR;
2896     UParseError parseError;
2897     UnicodeString message;
2898     Transliterator* devToLatinToDev  =Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2899     Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2900     Transliterator* devToTelToDev    =Transliterator::createInstance("Devanagari-Telugu;Telugu-Devanagari", UTRANS_FORWARD, parseError, status);
2901     Transliterator* latinToTelToLatin=Transliterator::createInstance("Latin-Telugu;Telugu-Latin", UTRANS_FORWARD, parseError, status);
2902 
2903     if(U_FAILURE(status)){
2904         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2905         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2906         return;
2907     }
2908     UnicodeString gotResult;
2909     for(int i= 0; i<MAX_LEN; i++){
2910         gotResult = source[i];
2911         expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2912         expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2913         expect(*latinToTelToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2914 
2915     }
2916     delete(latinToDevToLatin);
2917     delete(devToLatinToDev);
2918     delete(devToTelToDev);
2919     delete(latinToTelToLatin);
2920 }
2921 
2922 /**
2923  * Test Gurmukhi-Devanagari Tippi and Bindi
2924  */
TestGurmukhiDevanagari()2925 void TransliteratorTest::TestGurmukhiDevanagari(){
2926     // the rule says:
2927     // (\u0902) (when preceded by vowel)      --->  (\u0A02)
2928     // (\u0902) (when preceded by consonant)  --->  (\u0A70)
2929     UErrorCode status = U_ZERO_ERROR;
2930     UnicodeSet vowel(UnicodeString("[\\u0905-\\u090A \\u090F\\u0910\\u0913\\u0914 \\u093e-\\u0942\\u0947\\u0948\\u094B\\u094C\\u094D]", -1, US_INV).unescape(), status);
2931     UnicodeSet non_vowel(UnicodeString("[\\u0915-\\u0928\\u092A-\\u0930]", -1, US_INV).unescape(), status);
2932     UParseError parseError;
2933 
2934     UnicodeSetIterator vIter(vowel);
2935     UnicodeSetIterator nvIter(non_vowel);
2936     Transliterator* trans = Transliterator::createInstance("Devanagari-Gurmukhi",UTRANS_FORWARD, parseError, status);
2937     if(U_FAILURE(status)) {
2938       dataerrln("Error creating transliterator %s", u_errorName(status));
2939       delete trans;
2940       return;
2941     }
2942     UnicodeString src (" \\u0902", -1, US_INV);
2943     UnicodeString expected(" \\u0A02", -1, US_INV);
2944     src = src.unescape();
2945     expected= expected.unescape();
2946 
2947     while(vIter.next()){
2948         src.setCharAt(0,(UChar) vIter.getCodepoint());
2949         expected.setCharAt(0,(UChar) (vIter.getCodepoint()+0x0100));
2950         expect(*trans,src,expected);
2951     }
2952 
2953     expected.setCharAt(1,0x0A70);
2954     while(nvIter.next()){
2955         //src.setCharAt(0,(char) nvIter.codepoint);
2956         src.setCharAt(0,(UChar)nvIter.getCodepoint());
2957         expected.setCharAt(0,(UChar) (nvIter.getCodepoint()+0x0100));
2958         expect(*trans,src,expected);
2959     }
2960     delete trans;
2961 }
2962 /**
2963  * Test instantiation from a locale.
2964  */
TestLocaleInstantiation(void)2965 void TransliteratorTest::TestLocaleInstantiation(void) {
2966     UParseError pe;
2967     UErrorCode ec = U_ZERO_ERROR;
2968     Transliterator *t = Transliterator::createInstance("ru_RU-Latin", UTRANS_FORWARD, pe, ec);
2969     if (U_FAILURE(ec)) {
2970         dataerrln("FAIL: createInstance(ru_RU-Latin) - %s", u_errorName(ec));
2971         delete t;
2972         return;
2973     }
2974     expect(*t, CharsToUnicodeString("\\u0430"), "a");
2975     delete t;
2976 
2977     t = Transliterator::createInstance("en-el", UTRANS_FORWARD, pe, ec);
2978     if (U_FAILURE(ec)) {
2979         errln("FAIL: createInstance(en-el)");
2980         delete t;
2981         return;
2982     }
2983     expect(*t, "a", CharsToUnicodeString("\\u03B1"));
2984     delete t;
2985 }
2986 
2987 /**
2988  * Test title case handling of accent (should ignore accents)
2989  */
TestTitleAccents(void)2990 void TransliteratorTest::TestTitleAccents(void) {
2991     UParseError pe;
2992     UErrorCode ec = U_ZERO_ERROR;
2993     Transliterator *t = Transliterator::createInstance("Title", UTRANS_FORWARD, pe, ec);
2994     if (U_FAILURE(ec)) {
2995         errln("FAIL: createInstance(Title)");
2996         delete t;
2997         return;
2998     }
2999     expect(*t, CharsToUnicodeString("a\\u0300b can't abe"), CharsToUnicodeString("A\\u0300b Can't Abe"));
3000     delete t;
3001 }
3002 
3003 /**
3004  * Basic test of a locale resource based rule.
3005  */
TestLocaleResource()3006 void TransliteratorTest::TestLocaleResource() {
3007     const char* DATA[] = {
3008         // id                    from               to
3009         //"Latin-Greek/UNGEGN",    "b",               "\\u03bc\\u03c0",
3010         "Latin-el",              "b",               "\\u03bc\\u03c0",
3011         "Latin-Greek",           "b",               "\\u03B2",
3012         "Greek-Latin/UNGEGN",    "\\u03B2",         "v",
3013         "el-Latin",              "\\u03B2",         "v",
3014         "Greek-Latin",           "\\u03B2",         "b",
3015     };
3016     const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3017     for (int32_t i=0; i<DATA_length; i+=3) {
3018         UParseError pe;
3019         UErrorCode ec = U_ZERO_ERROR;
3020         Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, pe, ec);
3021         if (U_FAILURE(ec)) {
3022             dataerrln((UnicodeString)"FAIL: createInstance(" + DATA[i] + ") - " + u_errorName(ec));
3023             delete t;
3024             continue;
3025         }
3026         expect(*t, CharsToUnicodeString(DATA[i+1]),
3027                CharsToUnicodeString(DATA[i+2]));
3028         delete t;
3029     }
3030 }
3031 
3032 /**
3033  * Make sure parse errors reference the right line.
3034  */
TestParseError()3035 void TransliteratorTest::TestParseError() {
3036     static const char* rule =
3037         "a > b;\n"
3038         "# more stuff\n"
3039         "d << b;";
3040     UErrorCode ec = U_ZERO_ERROR;
3041     UParseError pe;
3042     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3043     delete t;
3044     if (U_FAILURE(ec)) {
3045         UnicodeString err(pe.preContext);
3046         err.append((UChar)124/*|*/).append(pe.postContext);
3047         if (err.indexOf("d << b") >= 0) {
3048             logln("Ok: " + err);
3049         } else {
3050             errln("FAIL: " + err);
3051         }
3052     }
3053     else {
3054         errln("FAIL: no syntax error");
3055     }
3056     static const char* maskingRule =
3057         "a>x;\n"
3058         "# more stuff\n"
3059         "ab>y;";
3060     ec = U_ZERO_ERROR;
3061     delete Transliterator::createFromRules("ID", maskingRule, UTRANS_FORWARD, pe, ec);
3062     if (ec != U_RULE_MASK_ERROR) {
3063         errln("FAIL: returned %s instead of U_RULE_MASK_ERROR", u_errorName(ec));
3064     }
3065     else if (UnicodeString("a > x;") != UnicodeString(pe.preContext)) {
3066         errln("FAIL: did not get expected precontext");
3067     }
3068     else if (UnicodeString("ab > y;") != UnicodeString(pe.postContext)) {
3069         errln("FAIL: did not get expected postcontext");
3070     }
3071 }
3072 
3073 /**
3074  * Make sure sets on output are disallowed.
3075  */
TestOutputSet()3076 void TransliteratorTest::TestOutputSet() {
3077     UnicodeString rule = "$set = [a-cm-n]; b > $set;";
3078     UErrorCode ec = U_ZERO_ERROR;
3079     UParseError pe;
3080     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3081     delete t;
3082     if (U_FAILURE(ec)) {
3083         UnicodeString err(pe.preContext);
3084         err.append((UChar)124/*|*/).append(pe.postContext);
3085         logln("Ok: " + err);
3086         return;
3087     }
3088     errln("FAIL: No syntax error");
3089 }
3090 
3091 /**
3092  * Test the use variable range pragma, making sure that use of
3093  * variable range characters is detected and flagged as an error.
3094  */
TestVariableRange()3095 void TransliteratorTest::TestVariableRange() {
3096     UnicodeString rule = "use variable range 0x70 0x72; a > A; b > B; q > Q;";
3097     UErrorCode ec = U_ZERO_ERROR;
3098     UParseError pe;
3099     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3100     delete t;
3101     if (U_FAILURE(ec)) {
3102         UnicodeString err(pe.preContext);
3103         err.append((UChar)124/*|*/).append(pe.postContext);
3104         logln("Ok: " + err);
3105         return;
3106     }
3107     errln("FAIL: No syntax error");
3108 }
3109 
3110 /**
3111  * Test invalid post context error handling
3112  */
TestInvalidPostContext()3113 void TransliteratorTest::TestInvalidPostContext() {
3114     UnicodeString rule = "a}b{c>d;";
3115     UErrorCode ec = U_ZERO_ERROR;
3116     UParseError pe;
3117     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3118     delete t;
3119     if (U_FAILURE(ec)) {
3120         UnicodeString err(pe.preContext);
3121         err.append((UChar)124/*|*/).append(pe.postContext);
3122         if (err.indexOf("a}b{c") >= 0) {
3123             logln("Ok: " + err);
3124         } else {
3125             errln("FAIL: " + err);
3126         }
3127         return;
3128     }
3129     errln("FAIL: No syntax error");
3130 }
3131 
3132 /**
3133  * Test ID form variants
3134  */
TestIDForms()3135 void TransliteratorTest::TestIDForms() {
3136     const char* DATA[] = {
3137         "NFC", NULL, "NFD",
3138         "nfd", NULL, "NFC", // make sure case is ignored
3139         "Any-NFKD", NULL, "Any-NFKC",
3140         "Null", NULL, "Null",
3141         "-nfkc", "nfkc", "NFKD",
3142         "-nfkc/", "nfkc", "NFKD",
3143         "Latin-Greek/UNGEGN", NULL, "Greek-Latin/UNGEGN",
3144         "Greek/UNGEGN-Latin", "Greek-Latin/UNGEGN", "Latin-Greek/UNGEGN",
3145         "Bengali-Devanagari/", "Bengali-Devanagari", "Devanagari-Bengali",
3146         "Source-", NULL, NULL,
3147         "Source/Variant-", NULL, NULL,
3148         "Source-/Variant", NULL, NULL,
3149         "/Variant", NULL, NULL,
3150         "/Variant-", NULL, NULL,
3151         "-/Variant", NULL, NULL,
3152         "-/", NULL, NULL,
3153         "-", NULL, NULL,
3154         "/", NULL, NULL,
3155     };
3156     const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3157 
3158     for (int32_t i=0; i<DATA_length; i+=3) {
3159         const char* ID = DATA[i];
3160         const char* expID = DATA[i+1];
3161         const char* expInvID = DATA[i+2];
3162         UBool expValid = (expInvID != NULL);
3163         if (expID == NULL) {
3164             expID = ID;
3165         }
3166         UParseError pe;
3167         UErrorCode ec = U_ZERO_ERROR;
3168         Transliterator *t =
3169             Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
3170         if (U_FAILURE(ec)) {
3171             if (!expValid) {
3172                 logln((UnicodeString)"Ok: getInstance(" + ID +") => " + u_errorName(ec));
3173             } else {
3174                 dataerrln((UnicodeString)"FAIL: Couldn't create " + ID + " - " + u_errorName(ec));
3175             }
3176             delete t;
3177             continue;
3178         }
3179         Transliterator *u = t->createInverse(ec);
3180         if (U_FAILURE(ec)) {
3181             errln((UnicodeString)"FAIL: Couldn't create inverse of " + ID);
3182             delete t;
3183             delete u;
3184             continue;
3185         }
3186         if (t->getID() == expID &&
3187             u->getID() == expInvID) {
3188             logln((UnicodeString)"Ok: " + ID + ".getInverse() => " + expInvID);
3189         } else {
3190             errln((UnicodeString)"FAIL: getInstance(" + ID + ") => " +
3191                   t->getID() + " x getInverse() => " + u->getID() +
3192                   ", expected " + expInvID);
3193         }
3194         delete t;
3195         delete u;
3196     }
3197 }
3198 
3199 static const UChar SPACE[]   = {32,0};
3200 static const UChar NEWLINE[] = {10,0};
3201 static const UChar RETURN[]  = {13,0};
3202 static const UChar EMPTY[]   = {0};
3203 
checkRules(const UnicodeString & label,Transliterator & t2,const UnicodeString & testRulesForward)3204 void TransliteratorTest::checkRules(const UnicodeString& label, Transliterator& t2,
3205                                     const UnicodeString& testRulesForward) {
3206     UnicodeString rules2; t2.toRules(rules2, true);
3207     //rules2 = TestUtility.replaceAll(rules2, new UnicodeSet("[' '\n\r]"), "");
3208     rules2.findAndReplace(SPACE, EMPTY);
3209     rules2.findAndReplace(NEWLINE, EMPTY);
3210     rules2.findAndReplace(RETURN, EMPTY);
3211 
3212     UnicodeString testRules(testRulesForward); testRules.findAndReplace(SPACE, EMPTY);
3213 
3214     if (rules2 != testRules) {
3215         errln(label);
3216         logln((UnicodeString)"GENERATED RULES: " + rules2);
3217         logln((UnicodeString)"SHOULD BE:       " + testRulesForward);
3218     }
3219 }
3220 
3221 /**
3222  * Mark's toRules test.
3223  */
TestToRulesMark()3224 void TransliteratorTest::TestToRulesMark() {
3225     const char* testRules =
3226         "::[[:Latin:][:Mark:]];"
3227         "::NFKD (NFC);"
3228         "::Lower (Lower);"
3229         "a <> \\u03B1;" // alpha
3230         "::NFKC (NFD);"
3231         "::Upper (Lower);"
3232         "::Lower ();"
3233         "::([[:Greek:][:Mark:]]);"
3234         ;
3235     const char* testRulesForward =
3236         "::[[:Latin:][:Mark:]];"
3237         "::NFKD(NFC);"
3238         "::Lower(Lower);"
3239         "a > \\u03B1;"
3240         "::NFKC(NFD);"
3241         "::Upper (Lower);"
3242         "::Lower ();"
3243         ;
3244     const char* testRulesBackward =
3245         "::[[:Greek:][:Mark:]];"
3246         "::Lower (Upper);"
3247         "::NFD(NFKC);"
3248         "\\u03B1 > a;"
3249         "::Lower(Lower);"
3250         "::NFC(NFKD);"
3251         ;
3252     UnicodeString source = CharsToUnicodeString("\\u00E1"); // a-acute
3253     UnicodeString target = CharsToUnicodeString("\\u03AC"); // alpha-acute
3254 
3255     UParseError pe;
3256     UErrorCode ec = U_ZERO_ERROR;
3257     LocalPointer<Transliterator> t2(
3258             Transliterator::createFromRules("source-target", UnicodeString(testRules, -1, US_INV), UTRANS_FORWARD, pe, ec));
3259     LocalPointer<Transliterator> t3(
3260             Transliterator::createFromRules("target-source", UnicodeString(testRules, -1, US_INV), UTRANS_REVERSE, pe, ec));
3261 
3262     if (U_FAILURE(ec)) {
3263         dataerrln((UnicodeString)"FAIL: createFromRules => " + u_errorName(ec));
3264         return;
3265     }
3266 
3267     expect(*t2, source, target);
3268     expect(*t3, target, source);
3269 
3270     checkRules("Failed toRules FORWARD", *t2, UnicodeString(testRulesForward, -1, US_INV));
3271     checkRules("Failed toRules BACKWARD", *t3, UnicodeString(testRulesBackward, -1, US_INV));
3272 }
3273 
3274 /**
3275  * Test Escape and Unescape transliterators.
3276  */
TestEscape()3277 void TransliteratorTest::TestEscape() {
3278     UParseError pe;
3279     UErrorCode ec;
3280     Transliterator *t;
3281 
3282     ec = U_ZERO_ERROR;
3283     t = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, pe, ec);
3284     if (U_FAILURE(ec)) {
3285         errln((UnicodeString)"FAIL: createInstance");
3286     } else {
3287         expect(*t,
3288                UNICODE_STRING_SIMPLE("\\x{40}\\U00000031&#x32;&#81;"),
3289                "@12Q");
3290     }
3291     delete t;
3292 
3293     ec = U_ZERO_ERROR;
3294     t = Transliterator::createInstance("Any-Hex/C", UTRANS_FORWARD, pe, ec);
3295     if (U_FAILURE(ec)) {
3296         errln((UnicodeString)"FAIL: createInstance");
3297     } else {
3298         expect(*t,
3299                CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3300                UNICODE_STRING_SIMPLE("\\u0041\\U0010BEEF\\uFEED"));
3301     }
3302     delete t;
3303 
3304     ec = U_ZERO_ERROR;
3305     t = Transliterator::createInstance("Any-Hex/Java", UTRANS_FORWARD, pe, ec);
3306     if (U_FAILURE(ec)) {
3307         errln((UnicodeString)"FAIL: createInstance");
3308     } else {
3309         expect(*t,
3310                CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3311                UNICODE_STRING_SIMPLE("\\u0041\\uDBEF\\uDEEF\\uFEED"));
3312     }
3313     delete t;
3314 
3315     ec = U_ZERO_ERROR;
3316     t = Transliterator::createInstance("Any-Hex/Perl", UTRANS_FORWARD, pe, ec);
3317     if (U_FAILURE(ec)) {
3318         errln((UnicodeString)"FAIL: createInstance");
3319     } else {
3320         expect(*t,
3321                CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3322                UNICODE_STRING_SIMPLE("\\x{41}\\x{10BEEF}\\x{FEED}"));
3323     }
3324     delete t;
3325 }
3326 
3327 
TestAnchorMasking()3328 void TransliteratorTest::TestAnchorMasking(){
3329     UnicodeString rule ("^a > Q; a > q;");
3330     UErrorCode status= U_ZERO_ERROR;
3331     UParseError parseError;
3332 
3333     Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
3334     if(U_FAILURE(status)){
3335         errln(UnicodeString("FAIL: ") + "ID" +
3336               ".createFromRules() => bad rules" +
3337               /*", parse error " + parseError.code +*/
3338               ", line " + parseError.line +
3339               ", offset " + parseError.offset +
3340               ", context " + prettify(parseError.preContext, true) +
3341               ", rules: " + prettify(rule, true));
3342     }
3343     delete t;
3344 }
3345 
3346 /**
3347  * Make sure display names of variants look reasonable.
3348  */
TestDisplayName()3349 void TransliteratorTest::TestDisplayName() {
3350 #if UCONFIG_NO_FORMATTING
3351     logln("Skipping, UCONFIG_NO_FORMATTING is set\n");
3352     return;
3353 #else
3354     static const char* DATA[] = {
3355         // ID, forward name, reverse name
3356         // Update the text as necessary -- the important thing is
3357         // not the text itself, but how various cases are handled.
3358 
3359         // Basic test
3360         "Any-Hex", "Any to Hex Escape", "Hex Escape to Any",
3361 
3362         // Variants
3363         "Any-Hex/Perl", "Any to Hex Escape/Perl", "Hex Escape to Any/Perl",
3364 
3365         // Target-only IDs
3366         "NFC", "Any to NFC", "Any to NFD",
3367     };
3368 
3369     int32_t DATA_length = UPRV_LENGTHOF(DATA);
3370 
3371     Locale US("en", "US");
3372 
3373     for (int32_t i=0; i<DATA_length; i+=3) {
3374         UnicodeString name;
3375         Transliterator::getDisplayName(DATA[i], US, name);
3376         if (name != DATA[i+1]) {
3377             dataerrln((UnicodeString)"FAIL: " + DATA[i] + ".getDisplayName() => " +
3378                   name + ", expected " + DATA[i+1]);
3379         } else {
3380             logln((UnicodeString)"Ok: " + DATA[i] + ".getDisplayName() => " + name);
3381         }
3382         UErrorCode ec = U_ZERO_ERROR;
3383         UParseError pe;
3384         Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_REVERSE, pe, ec);
3385         if (U_FAILURE(ec)) {
3386             delete t;
3387             dataerrln("FAIL: createInstance failed - %s", u_errorName(ec));
3388             continue;
3389         }
3390         name = Transliterator::getDisplayName(t->getID(), US, name);
3391         if (name != DATA[i+2]) {
3392             dataerrln((UnicodeString)"FAIL: " + t->getID() + ".getDisplayName() => " +
3393                   name + ", expected " + DATA[i+2]);
3394         } else {
3395             logln((UnicodeString)"Ok: " + t->getID() + ".getDisplayName() => " + name);
3396         }
3397         delete t;
3398     }
3399 #endif
3400 }
3401 
TestSpecialCases(void)3402 void TransliteratorTest::TestSpecialCases(void) {
3403     const UnicodeString registerRules[] = {
3404         "Any-Dev1", "x > X; y > Y;",
3405         "Any-Dev2", "XY > Z",
3406         "Greek-Latin/FAKE",
3407             CharsToUnicodeString
3408             ("[^[:L:][:M:]] { \\u03bc\\u03c0 > b ; \\u03bc\\u03c0 } [^[:L:][:M:]] > b ; [^[:L:][:M:]] { [\\u039c\\u03bc][\\u03a0\\u03c0] > B ; [\\u039c\\u03bc][\\u03a0\\u03c0] } [^[:L:][:M:]] > B ;"),
3409         "" // END MARKER
3410     };
3411 
3412     const UnicodeString testCases[] = {
3413         // NORMALIZATION
3414         // should add more test cases
3415         "NFD" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3416         "NFC" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3417         "NFKD", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3418         "NFKC", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3419 
3420         // mp -> b BUG
3421         "Greek-Latin/UNGEGN", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3422         "Greek-Latin/FAKE", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3423 
3424         // check for devanagari bug
3425         "nfd;Dev1;Dev2;nfc", "xy", "Z",
3426 
3427         // ff, i, dotless-i, I, dotted-I, LJLjlj deseret deeDEE
3428         "Title", CharsToUnicodeString("ab'cD ffi\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3429                  CharsToUnicodeString("Ab'cd Ffi\\u0131ii\\u0307 \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3430 
3431         //TODO: enable this test once Titlecase works right
3432         /*
3433         "Title", CharsToUnicodeString("\\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3434                  CharsToUnicodeString("Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3435                  */
3436         "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3437                  CharsToUnicodeString("AB'CD FFIII\\u0130 \\u01C7\\u01C7\\u01C7 ") + DESERET_DEE + DESERET_DEE,
3438         "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3439                  CharsToUnicodeString("ab'cd \\uFB00i\\u0131ii\\u0307 \\u01C9\\u01C9\\u01C9 ") + DESERET_dee + DESERET_dee,
3440 
3441         "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3442         "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3443 
3444          // FORMS OF S
3445         "Greek-Latin/UNGEGN",  CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3446                                CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3447         "Latin-Greek/UNGEGN",  CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3448                                CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3") ,
3449         "Greek-Latin",  CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3450                         CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3451         "Latin-Greek",  CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3452                         CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3453         // Tatiana bug
3454         // Upper: TAT\\u02B9\\u00C2NA
3455         // Lower: tat\\u02B9\\u00E2na
3456         // Title: Tat\\u02B9\\u00E2na
3457         "Upper", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3458                  CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3459         "Lower", CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3460                  CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3461         "Title", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3462                  CharsToUnicodeString("Tat\\u02B9\\u00E2na"),
3463 
3464         "" // END MARKER
3465     };
3466 
3467     UParseError pos;
3468     int32_t i;
3469     for (i = 0; registerRules[i].length()!=0; i+=2) {
3470         UErrorCode status = U_ZERO_ERROR;
3471 
3472         Transliterator *t = Transliterator::createFromRules(registerRules[0+i],
3473             registerRules[i+1], UTRANS_FORWARD, pos, status);
3474         if (U_FAILURE(status)) {
3475             dataerrln("Fails: Unable to create the transliterator from rules. - %s", u_errorName(status));
3476         } else {
3477             Transliterator::registerInstance(t);
3478         }
3479     }
3480     for (i = 0; testCases[i].length()!=0; i+=3) {
3481         UErrorCode ec = U_ZERO_ERROR;
3482         UParseError pe;
3483         const UnicodeString& name = testCases[i];
3484         Transliterator *t = Transliterator::createInstance(name, UTRANS_FORWARD, pe, ec);
3485         if (U_FAILURE(ec)) {
3486             dataerrln((UnicodeString)"FAIL: Couldn't create " + name + " - " + u_errorName(ec));
3487             delete t;
3488             continue;
3489         }
3490         const UnicodeString& id = t->getID();
3491         const UnicodeString& source = testCases[i+1];
3492         UnicodeString target;
3493 
3494         // Automatic generation of targets, to make it simpler to add test cases (and more fail-safe)
3495 
3496         if (testCases[i+2].length() > 0) {
3497             target = testCases[i+2];
3498         } else if (0==id.caseCompare("NFD", U_FOLD_CASE_DEFAULT)) {
3499             Normalizer::normalize(source, UNORM_NFD, 0, target, ec);
3500         } else if (0==id.caseCompare("NFC", U_FOLD_CASE_DEFAULT)) {
3501             Normalizer::normalize(source, UNORM_NFC, 0, target, ec);
3502         } else if (0==id.caseCompare("NFKD", U_FOLD_CASE_DEFAULT)) {
3503             Normalizer::normalize(source, UNORM_NFKD, 0, target, ec);
3504         } else if (0==id.caseCompare("NFKC", U_FOLD_CASE_DEFAULT)) {
3505             Normalizer::normalize(source, UNORM_NFKC, 0, target, ec);
3506         } else if (0==id.caseCompare("Lower", U_FOLD_CASE_DEFAULT)) {
3507             target = source;
3508             target.toLower(Locale::getUS());
3509         } else if (0==id.caseCompare("Upper", U_FOLD_CASE_DEFAULT)) {
3510             target = source;
3511             target.toUpper(Locale::getUS());
3512         }
3513         if (U_FAILURE(ec)) {
3514             errln((UnicodeString)"FAIL: Internal error normalizing " + source);
3515             continue;
3516         }
3517 
3518         expect(*t, source, target);
3519         delete t;
3520     }
3521     for (i = 0; registerRules[i].length()!=0; i+=2) {
3522         Transliterator::unregister(registerRules[i]);
3523     }
3524 }
3525 
Char32ToEscapedChars(UChar32 ch,char * buffer)3526 char* Char32ToEscapedChars(UChar32 ch, char* buffer) {
3527     if (ch <= 0xFFFF) {
3528         sprintf(buffer, "\\u%04x", (int)ch);
3529     } else {
3530         sprintf(buffer, "\\U%08x", (int)ch);
3531     }
3532     return buffer;
3533 }
3534 
TestSurrogateCasing(void)3535 void TransliteratorTest::TestSurrogateCasing (void) {
3536     // check that casing handles surrogates
3537     // titlecase is currently defective
3538     char buffer[20];
3539     UChar buffer2[20];
3540     UChar32 dee;
3541     U16_GET(DESERET_dee,0, 0, DESERET_dee.length(), dee);
3542     UnicodeString DEE(u_totitle(dee));
3543     if (DEE != DESERET_DEE) {
3544         err("Fails titlecase of surrogates");
3545         err(Char32ToEscapedChars(dee, buffer));
3546         err(", ");
3547         errln(Char32ToEscapedChars(DEE.char32At(0), buffer));
3548     }
3549 
3550     UnicodeString deeDEETest=DESERET_dee + DESERET_DEE;
3551     UnicodeString deedeeTest = DESERET_dee + DESERET_dee;
3552     UnicodeString DEEDEETest = DESERET_DEE + DESERET_DEE;
3553     UErrorCode status= U_ZERO_ERROR;
3554 
3555     u_strToUpper(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
3556     if (U_FAILURE(status) || (UnicodeString(buffer2)!= DEEDEETest)) {
3557         errln("Fails: Can't uppercase surrogates.");
3558     }
3559 
3560     status= U_ZERO_ERROR;
3561     u_strToLower(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
3562     if (U_FAILURE(status) || (UnicodeString(buffer2)!= deedeeTest)) {
3563         errln("Fails: Can't lowercase surrogates.");
3564     }
3565 }
3566 
_trans(Transliterator & t,const UnicodeString & src,UnicodeString & result)3567 static void _trans(Transliterator& t, const UnicodeString& src,
3568                    UnicodeString& result) {
3569     result = src;
3570     t.transliterate(result);
3571 }
3572 
_trans(const UnicodeString & id,const UnicodeString & src,UnicodeString & result,UErrorCode ec)3573 static void _trans(const UnicodeString& id, const UnicodeString& src,
3574                    UnicodeString& result, UErrorCode ec) {
3575     UParseError pe;
3576     Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
3577     if (U_SUCCESS(ec)) {
3578         _trans(*t, src, result);
3579     }
3580     delete t;
3581 }
3582 
_findMatch(const UnicodeString & source,const UnicodeString * pairs)3583 static UnicodeString _findMatch(const UnicodeString& source,
3584                                        const UnicodeString* pairs) {
3585     UnicodeString empty;
3586     for (int32_t i=0; pairs[i].length() > 0; i+=2) {
3587         if (0==source.caseCompare(pairs[i], U_FOLD_CASE_DEFAULT)) {
3588             return pairs[i+1];
3589         }
3590     }
3591     return empty;
3592 }
3593 
3594 // Check to see that incremental gets at least part way through a reasonable string.
3595 
TestIncrementalProgress(void)3596 void TransliteratorTest::TestIncrementalProgress(void) {
3597     UErrorCode ec = U_ZERO_ERROR;
3598     UnicodeString latinTest = "The Quick Brown Fox.";
3599     UnicodeString devaTest;
3600     _trans("Latin-Devanagari", latinTest, devaTest, ec);
3601     UnicodeString kataTest;
3602     _trans("Latin-Katakana", latinTest, kataTest, ec);
3603     if (U_FAILURE(ec)) {
3604         errln("FAIL: Internal error");
3605         return;
3606     }
3607     const UnicodeString tests[] = {
3608         "Any", latinTest,
3609         "Latin", latinTest,
3610         "Halfwidth", latinTest,
3611         "Devanagari", devaTest,
3612         "Katakana", kataTest,
3613         "" // END MARKER
3614     };
3615 
3616     UnicodeString test("The Quick Brown Fox Jumped Over The Lazy Dog.");
3617     int32_t i = 0, j=0, k=0;
3618     int32_t sources = Transliterator::countAvailableSources();
3619     for (i = 0; i < sources; i++) {
3620         UnicodeString source;
3621         Transliterator::getAvailableSource(i, source);
3622         UnicodeString test = _findMatch(source, tests);
3623         if (test.length() == 0) {
3624             logln((UnicodeString)"Skipping " + source + "-X");
3625             continue;
3626         }
3627         int32_t targets = Transliterator::countAvailableTargets(source);
3628         for (j = 0; j < targets; j++) {
3629             UnicodeString target;
3630             Transliterator::getAvailableTarget(j, source, target);
3631             int32_t variants = Transliterator::countAvailableVariants(source, target);
3632             for (k =0; k< variants; k++) {
3633                 UnicodeString variant;
3634                 UParseError err;
3635                 UErrorCode status = U_ZERO_ERROR;
3636 
3637                 Transliterator::getAvailableVariant(k, source, target, variant);
3638                 UnicodeString id = source + "-" + target + "/" + variant;
3639 
3640                 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, err, status);
3641                 if (U_FAILURE(status)) {
3642                     dataerrln((UnicodeString)"FAIL: Could not create " + id + ", status " + u_errorName(status));
3643                     delete t;
3644                     continue;
3645                 }
3646                 status = U_ZERO_ERROR;
3647                 CheckIncrementalAux(t, test);
3648 
3649                 UnicodeString rev;
3650                 _trans(*t, test, rev);
3651                 Transliterator *inv = t->createInverse(status);
3652                 if (U_FAILURE(status)) {
3653                     // The following are forward-only, it is OK that creating an inverse will not work:
3654                     // 1. Devanagari-Arabic
3655                     // 2. Any-*/BGN
3656                     // 2a. Any-*/BGN_1981
3657                     // 3. Any-*/MNS
3658                     //
3659                     // 4. If UCONFIG_NO_BREAK_ITERATION is on, Latin-Thai is also not expected to work.
3660                     //
3661                     // The following are direction="both" transforms with variants, inverting the Any-Xxxx/Variant for
3662                     // any of these does not work; see ICU-21911 (not sure whether this is intentional or an ICU bug).
3663                     // Unfortunately we do not easily have the info at this point as to whether the original transform
3664                     // had direction="both" specified.
3665                     // 5. Any-*/UNGEGN
3666                     // 6. Any-Ethiopic/*
3667                     // 7. Any-Braille/*
3668                     // 8. Any-*/Gurage_2013
3669                     // 9. Any-*/Gutgarts
3670                     // 10. Any-*/Tekie_Alibekit
3671                     // 11. Any-*/Xaleget
3672                     //
3673                     if (    id.compare((UnicodeString)"Devanagari-Arabic/") != 0
3674                          && !(id.startsWith((UnicodeString)"Any-") &&
3675                                 (id.endsWith((UnicodeString)"/BGN") || id.endsWith((UnicodeString)"/BGN_1981") || id.endsWith((UnicodeString)"/MNS"))
3676                              )
3677 #if UCONFIG_NO_BREAK_ITERATION
3678                          && id.compare((UnicodeString)"Latin-Thai/") != 0
3679 #endif
3680                          && !(logKnownIssue("21911", "ICU4C cannot create inverse of Any-Xxxx/Variant transform created from both-direction transform") &&
3681                                 id.startsWith((UnicodeString)"Any-") &&
3682                                 (id.endsWith((UnicodeString)"/UNGEGN") || id.startsWith((UnicodeString)"Any-Ethiopic/") || id.startsWith((UnicodeString)"Any-Braille/") ||
3683                                  id.endsWith((UnicodeString)"/Gurage_2013") || id.endsWith((UnicodeString)"/Gutgarts") || id.endsWith((UnicodeString)"/Tekie_Alibekit") ||
3684                                  id.endsWith((UnicodeString)"/Xaleget"))
3685                              )
3686                        )
3687                     {
3688                         errln((UnicodeString)"FAIL: Could not create inverse of " + id + ", status " + u_errorName(status));
3689                     }
3690                     delete t;
3691                     delete inv;
3692                     continue;
3693                 }
3694                 CheckIncrementalAux(inv, rev);
3695                 delete t;
3696                 delete inv;
3697             }
3698         }
3699     }
3700 }
3701 
CheckIncrementalAux(const Transliterator * t,const UnicodeString & input)3702 void TransliteratorTest::CheckIncrementalAux(const Transliterator* t,
3703                                                       const UnicodeString& input) {
3704     UErrorCode ec = U_ZERO_ERROR;
3705     UTransPosition pos;
3706     UnicodeString test = input;
3707 
3708     pos.contextStart = 0;
3709     pos.contextLimit = input.length();
3710     pos.start = 0;
3711     pos.limit = input.length();
3712 
3713     t->transliterate(test, pos, ec);
3714     if (U_FAILURE(ec)) {
3715         errln((UnicodeString)"FAIL: transliterate() error " + u_errorName(ec));
3716         return;
3717     }
3718     UBool gotError = false;
3719     (void)gotError;    // Suppress set but not used warning.
3720 
3721     // we have a few special cases. Any-Remove (pos.start = 0, but also = limit) and U+XXXXX?X?
3722 
3723     if (pos.start == 0 && pos.limit != 0 && t->getID() != "Hex-Any/Unicode") {
3724         errln((UnicodeString)"No Progress, " +
3725               t->getID() + ": " + formatInput(test, input, pos));
3726         gotError = true;
3727     } else {
3728         logln((UnicodeString)"PASS Progress, " +
3729               t->getID() + ": " + formatInput(test, input, pos));
3730     }
3731     t->finishTransliteration(test, pos);
3732     if (pos.start != pos.limit) {
3733         errln((UnicodeString)"Incomplete, " +
3734               t->getID() + ": " + formatInput(test, input, pos));
3735         gotError = true;
3736     }
3737 }
3738 
TestFunction()3739 void TransliteratorTest::TestFunction() {
3740     // Careful with spacing and ';' here:  Phrase this exactly
3741     // as toRules() is going to return it.  If toRules() changes
3742     // with regard to spacing or ';', then adjust this string.
3743     UnicodeString rule =
3744         "([:Lu:]) > $1 '(' &Lower( $1 ) '=' &Hex( &Any-Lower( $1 ) ) ')';";
3745 
3746     UParseError pe;
3747     UErrorCode ec = U_ZERO_ERROR;
3748     Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3749     if (t == NULL) {
3750         dataerrln("FAIL: createFromRules failed - %s", u_errorName(ec));
3751         return;
3752     }
3753 
3754     UnicodeString r;
3755     t->toRules(r, true);
3756     if (r == rule) {
3757         logln((UnicodeString)"OK: toRules() => " + r);
3758     } else {
3759         errln((UnicodeString)"FAIL: toRules() => " + r +
3760               ", expected " + rule);
3761     }
3762 
3763     expect(*t, "The Quick Brown Fox",
3764            UNICODE_STRING_SIMPLE("T(t=\\u0074)he Q(q=\\u0071)uick B(b=\\u0062)rown F(f=\\u0066)ox"));
3765 
3766     delete t;
3767 }
3768 
TestInvalidBackRef(void)3769 void TransliteratorTest::TestInvalidBackRef(void) {
3770     UnicodeString rule =  ". > $1;";
3771     UnicodeString rule2 =CharsToUnicodeString("(.) <> &hex/unicode($1) &name($1); . > $1; [{}] >\\u0020;");
3772     UParseError pe;
3773     UErrorCode ec = U_ZERO_ERROR;
3774     Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3775     Transliterator *t2 = Transliterator::createFromRules("Test2", rule2, UTRANS_FORWARD, pe, ec);
3776 
3777     if (t != NULL) {
3778         errln("FAIL: createFromRules should have returned NULL");
3779         delete t;
3780     }
3781 
3782     if (t2 != NULL) {
3783         errln("FAIL: createFromRules should have returned NULL");
3784         delete t2;
3785     }
3786 
3787     if (U_SUCCESS(ec)) {
3788         errln("FAIL: Ok: . > $1; => no error");
3789     } else {
3790         logln((UnicodeString)"Ok: . > $1; => " + u_errorName(ec));
3791     }
3792 }
3793 
TestMulticharStringSet()3794 void TransliteratorTest::TestMulticharStringSet() {
3795     // Basic testing
3796     const char* rule =
3797         "       [{aa}]       > x;"
3798         "         a          > y;"
3799         "       [b{bc}]      > z;"
3800         "[{gd}] { e          > q;"
3801         "         e } [{fg}] > r;" ;
3802 
3803     UParseError pe;
3804     UErrorCode ec = U_ZERO_ERROR;
3805     Transliterator* t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3806     if (t == NULL || U_FAILURE(ec)) {
3807         delete t;
3808         errln("FAIL: createFromRules failed");
3809         return;
3810     }
3811 
3812     expect(*t, "a aa ab bc d gd de gde gdefg ddefg",
3813            "y x yz z d gd de gdq gdqfg ddrfg");
3814     delete t;
3815 
3816     // Overlapped string test.  Make sure that when multiple
3817     // strings can match that the longest one is matched.
3818     rule =
3819         "    [a {ab} {abc}]    > x;"
3820         "           b          > y;"
3821         "           c          > z;"
3822         " q [t {st} {rst}] { e > p;" ;
3823 
3824     t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3825     if (t == NULL || U_FAILURE(ec)) {
3826         delete t;
3827         errln("FAIL: createFromRules failed");
3828         return;
3829     }
3830 
3831     expect(*t, "a ab abc qte qste qrste",
3832            "x x x qtp qstp qrstp");
3833     delete t;
3834 }
3835 
3836 // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
3837 // BEGIN TestUserFunction support factory
3838 
3839 Transliterator* _TUFF[4];
3840 UnicodeString* _TUFID[4];
3841 
_TUFFactory(const UnicodeString &,Transliterator::Token context)3842 static Transliterator* U_EXPORT2 _TUFFactory(const UnicodeString& /*ID*/,
3843                                    Transliterator::Token context) {
3844     return _TUFF[context.integer]->clone();
3845 }
3846 
_TUFReg(const UnicodeString & ID,Transliterator * t,int32_t n)3847 static void _TUFReg(const UnicodeString& ID, Transliterator* t, int32_t n) {
3848     _TUFF[n] = t;
3849     _TUFID[n] = new UnicodeString(ID);
3850     Transliterator::registerFactory(ID, _TUFFactory, Transliterator::integerToken(n));
3851 }
3852 
_TUFUnreg(int32_t n)3853 static void _TUFUnreg(int32_t n) {
3854     if (_TUFF[n] != NULL) {
3855         Transliterator::unregister(*_TUFID[n]);
3856         delete _TUFF[n];
3857         delete _TUFID[n];
3858     }
3859 }
3860 
3861 // END TestUserFunction support factory
3862 // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3863 
3864 /**
3865  * Test that user-registered transliterators can be used under function
3866  * syntax.
3867  */
TestUserFunction()3868 void TransliteratorTest::TestUserFunction() {
3869 
3870     Transliterator* t;
3871     UParseError pe;
3872     UErrorCode ec = U_ZERO_ERROR;
3873 
3874     // Setup our factory
3875     int32_t i;
3876     for (i=0; i<4; ++i) {
3877         _TUFF[i] = NULL;
3878     }
3879 
3880     // There's no need to register inverses if we don't use them
3881     t = Transliterator::createFromRules("gif",
3882                                         UNICODE_STRING_SIMPLE("'\\'u(..)(..) > '<img src=\"http://www.unicode.org/gifs/24/' $1 '/U' $1$2 '.gif\">';"),
3883                                         UTRANS_FORWARD, pe, ec);
3884     if (t == NULL || U_FAILURE(ec)) {
3885         dataerrln((UnicodeString)"FAIL: createFromRules gif " + u_errorName(ec));
3886         return;
3887     }
3888     _TUFReg("Any-gif", t, 0);
3889 
3890     t = Transliterator::createFromRules("RemoveCurly",
3891                                         UNICODE_STRING_SIMPLE("[\\{\\}] > ; '\\N' > ;"),
3892                                         UTRANS_FORWARD, pe, ec);
3893     if (t == NULL || U_FAILURE(ec)) {
3894         errln((UnicodeString)"FAIL: createFromRules RemoveCurly " + u_errorName(ec));
3895         goto FAIL;
3896     }
3897     expect(*t, UNICODE_STRING_SIMPLE("\\N{name}"), "name");
3898     _TUFReg("Any-RemoveCurly", t, 1);
3899 
3900     logln("Trying &hex");
3901     t = Transliterator::createFromRules("hex2",
3902                                         "(.) > &hex($1);",
3903                                         UTRANS_FORWARD, pe, ec);
3904     if (t == NULL || U_FAILURE(ec)) {
3905         errln("FAIL: createFromRules");
3906         goto FAIL;
3907     }
3908     logln("Registering");
3909     _TUFReg("Any-hex2", t, 2);
3910     t = Transliterator::createInstance("Any-hex2", UTRANS_FORWARD, ec);
3911     if (t == NULL || U_FAILURE(ec)) {
3912         errln((UnicodeString)"FAIL: createInstance Any-hex2 " + u_errorName(ec));
3913         goto FAIL;
3914     }
3915     expect(*t, "abc", UNICODE_STRING_SIMPLE("\\u0061\\u0062\\u0063"));
3916     delete t;
3917 
3918     logln("Trying &gif");
3919     t = Transliterator::createFromRules("gif2",
3920                                         "(.) > &Gif(&Hex2($1));",
3921                                         UTRANS_FORWARD, pe, ec);
3922     if (t == NULL || U_FAILURE(ec)) {
3923         errln((UnicodeString)"FAIL: createFromRules gif2 " + u_errorName(ec));
3924         goto FAIL;
3925     }
3926     logln("Registering");
3927     _TUFReg("Any-gif2", t, 3);
3928     t = Transliterator::createInstance("Any-gif2", UTRANS_FORWARD, ec);
3929     if (t == NULL || U_FAILURE(ec)) {
3930         errln((UnicodeString)"FAIL: createInstance Any-gif2 " + u_errorName(ec));
3931         goto FAIL;
3932     }
3933     expect(*t, "ab", "<img src=\"http://www.unicode.org/gifs/24/00/U0061.gif\">"
3934            "<img src=\"http://www.unicode.org/gifs/24/00/U0062.gif\">");
3935     delete t;
3936 
3937     // Test that filters are allowed after &
3938     t = Transliterator::createFromRules("test",
3939                                         "(.) > &Hex($1) ' ' &RemoveCurly(&Name($1)) ' ';",
3940                                         UTRANS_FORWARD, pe, ec);
3941     if (t == NULL || U_FAILURE(ec)) {
3942         errln((UnicodeString)"FAIL: createFromRules test " + u_errorName(ec));
3943         goto FAIL;
3944     }
3945     expect(*t, "abc",
3946            UNICODE_STRING_SIMPLE("\\u0061 LATIN SMALL LETTER A \\u0062 LATIN SMALL LETTER B \\u0063 LATIN SMALL LETTER C "));
3947     delete t;
3948 
3949  FAIL:
3950     for (i=0; i<4; ++i) {
3951         _TUFUnreg(i);
3952     }
3953 }
3954 
3955 /**
3956  * Test the Any-X transliterators.
3957  */
TestAnyX(void)3958 void TransliteratorTest::TestAnyX(void) {
3959     UParseError parseError;
3960     UErrorCode status = U_ZERO_ERROR;
3961     Transliterator* anyLatin =
3962         Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
3963     if (anyLatin==0) {
3964         dataerrln("FAIL: createInstance returned NULL - %s", u_errorName(status));
3965         delete anyLatin;
3966         return;
3967     }
3968 
3969     expect(*anyLatin,
3970            CharsToUnicodeString("greek:\\u03B1\\u03B2\\u03BA\\u0391\\u0392\\u039A hiragana:\\u3042\\u3076\\u304F cyrillic:\\u0430\\u0431\\u0446"),
3971            CharsToUnicodeString("greek:abkABK hiragana:abuku cyrillic:abc"));
3972 
3973     delete anyLatin;
3974 }
3975 
3976 /**
3977  * Test Any-X transliterators with sample letters from all scripts.
3978  */
TestAny(void)3979 void TransliteratorTest::TestAny(void) {
3980     UErrorCode status = U_ZERO_ERROR;
3981     // Note: there is a lot of implicit construction of UnicodeStrings from (char *) in
3982     //       function call parameters going on in this test.
3983     UnicodeSet alphabetic("[:alphabetic:]", status);
3984     if (U_FAILURE(status)) {
3985         dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
3986         return;
3987     }
3988     alphabetic.freeze();
3989 
3990     UnicodeString testString;
3991     for (int32_t i = 0; i < USCRIPT_CODE_LIMIT; i++) {
3992         const char *scriptName = uscript_getShortName((UScriptCode)i);
3993         if (scriptName == NULL) {
3994             errln("Failure: file %s, line %d: Script Code %d is invalid, ", __FILE__, __LINE__, i);
3995             return;
3996         }
3997 
3998         UnicodeSet sample;
3999         sample.applyPropertyAlias("script", scriptName, status);
4000         if (U_FAILURE(status)) {
4001             errln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4002             return;
4003         }
4004         sample.retainAll(alphabetic);
4005         for (int32_t count=0; count<5; count++) {
4006             UChar32 c = sample.charAt(count);
4007             if (c == -1) {
4008                 break;
4009             }
4010             testString.append(c);
4011         }
4012     }
4013 
4014     UParseError parseError;
4015     Transliterator* anyLatin =
4016         Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4017     if (U_FAILURE(status)) {
4018         dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4019         return;
4020     }
4021 
4022     logln(UnicodeString("Sample set for Any-Latin: ") + testString);
4023     anyLatin->transliterate(testString);
4024     logln(UnicodeString("Sample result for Any-Latin: ") + testString);
4025     delete anyLatin;
4026 }
4027 
4028 
4029 /**
4030  * Test the source and target set API.  These are only implemented
4031  * for RBT and CompoundTransliterator at this time.
4032  */
TestSourceTargetSet()4033 void TransliteratorTest::TestSourceTargetSet() {
4034     UErrorCode ec = U_ZERO_ERROR;
4035 
4036     // Rules
4037     const char* r =
4038         "a > b; "
4039         "r [x{lu}] > q;";
4040 
4041     // Expected source
4042     UnicodeSet expSrc("[arx{lu}]", ec);
4043 
4044     // Expected target
4045     UnicodeSet expTrg("[bq]", ec);
4046 
4047     UParseError pe;
4048     Transliterator* t = Transliterator::createFromRules("test", r, UTRANS_FORWARD, pe, ec);
4049 
4050     if (U_FAILURE(ec)) {
4051         delete t;
4052         errln("FAIL: Couldn't set up test");
4053         return;
4054     }
4055 
4056     UnicodeSet src; t->getSourceSet(src);
4057     UnicodeSet trg; t->getTargetSet(trg);
4058 
4059     if (src == expSrc && trg == expTrg) {
4060         UnicodeString a, b;
4061         logln((UnicodeString)"Ok: " +
4062               r + " => source = " + src.toPattern(a, true) +
4063               ", target = " + trg.toPattern(b, true));
4064     } else {
4065         UnicodeString a, b, c, d;
4066         errln((UnicodeString)"FAIL: " +
4067               r + " => source = " + src.toPattern(a, true) +
4068               ", expected " + expSrc.toPattern(b, true) +
4069               "; target = " + trg.toPattern(c, true) +
4070               ", expected " + expTrg.toPattern(d, true));
4071     }
4072 
4073     delete t;
4074 }
4075 
4076 /**
4077  * Test handling of Pattern_White_Space, for both RBT and UnicodeSet.
4078  */
TestPatternWhiteSpace()4079 void TransliteratorTest::TestPatternWhiteSpace() {
4080     // Rules
4081     const char* r = "a > \\u200E b;";
4082 
4083     UErrorCode ec = U_ZERO_ERROR;
4084     UParseError pe;
4085     Transliterator* t = Transliterator::createFromRules("test", CharsToUnicodeString(r), UTRANS_FORWARD, pe, ec);
4086 
4087     if (U_FAILURE(ec)) {
4088         errln("FAIL: Couldn't set up test");
4089     } else {
4090         expect(*t, "a", "b");
4091     }
4092     delete t;
4093 
4094     // UnicodeSet
4095     ec = U_ZERO_ERROR;
4096     UnicodeSet set(CharsToUnicodeString("[a \\u200E]"), ec);
4097 
4098     if (U_FAILURE(ec)) {
4099         errln("FAIL: Couldn't set up test");
4100     } else {
4101         if (set.contains(0x200E)) {
4102             errln("FAIL: U+200E not being ignored by UnicodeSet");
4103         }
4104     }
4105 }
4106 //======================================================================
4107 // this method is in TestUScript.java
4108 //======================================================================
TestAllCodepoints()4109 void TransliteratorTest::TestAllCodepoints(){
4110     UScriptCode code= USCRIPT_INVALID_CODE;
4111     char id[256]={'\0'};
4112     char abbr[256]={'\0'};
4113     char newId[256]={'\0'};
4114     char newAbbrId[256]={'\0'};
4115     char oldId[256]={'\0'};
4116     char oldAbbrId[256]={'\0'};
4117 
4118     UErrorCode status =U_ZERO_ERROR;
4119     UParseError pe;
4120 
4121     for(uint32_t i = 0; i<=0x10ffff; i++){
4122         code =  uscript_getScript(i,&status);
4123         if(code == USCRIPT_INVALID_CODE){
4124             dataerrln("uscript_getScript for codepoint \\U%08X failed.", i);
4125         }
4126         const char* myId = uscript_getName(code);
4127         if(!myId) {
4128           dataerrln("Valid script code returned NULL name. Check your data!");
4129           return;
4130         }
4131         uprv_strcpy(id,myId);
4132         uprv_strcpy(abbr,uscript_getShortName(code));
4133 
4134         uprv_strcpy(newId,"[:");
4135         uprv_strcat(newId,id);
4136         uprv_strcat(newId,":];NFD");
4137 
4138         uprv_strcpy(newAbbrId,"[:");
4139         uprv_strcat(newAbbrId,abbr);
4140         uprv_strcat(newAbbrId,":];NFD");
4141 
4142         if(uprv_strcmp(newId,oldId)!=0){
4143             Transliterator* t = Transliterator::createInstance(newId,UTRANS_FORWARD,pe,status);
4144             if(t==NULL || U_FAILURE(status)){
4145                 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4146             }
4147             delete t;
4148         }
4149         if(uprv_strcmp(newAbbrId,oldAbbrId)!=0){
4150             Transliterator* t = Transliterator::createInstance(newAbbrId,UTRANS_FORWARD,pe,status);
4151             if(t==NULL || U_FAILURE(status)){
4152                 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4153             }
4154             delete t;
4155         }
4156         uprv_strcpy(oldId,newId);
4157         uprv_strcpy(oldAbbrId, newAbbrId);
4158 
4159     }
4160 
4161 }
4162 
4163 #define TEST_TRANSLIT_ID(id, cls) UPRV_BLOCK_MACRO_BEGIN { \
4164   UErrorCode ec = U_ZERO_ERROR; \
4165   Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); \
4166   if (U_FAILURE(ec)) { \
4167     dataerrln("FAIL: Couldn't create %s - %s", id, u_errorName(ec)); \
4168   } else { \
4169     if (t->getDynamicClassID() != cls::getStaticClassID()) { \
4170       errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
4171     } \
4172     /* *t = *t; */ /*can't do this: coverage test for assignment op*/ \
4173   } \
4174   delete t; \
4175 } UPRV_BLOCK_MACRO_END
4176 
4177 #define TEST_TRANSLIT_RULE(rule, cls) UPRV_BLOCK_MACRO_BEGIN { \
4178   UErrorCode ec = U_ZERO_ERROR; \
4179   UParseError pe; \
4180   Transliterator* t = Transliterator::createFromRules("_", rule, UTRANS_FORWARD, pe, ec); \
4181   if (U_FAILURE(ec)) { \
4182     errln("FAIL: Couldn't create " rule); \
4183   } else { \
4184     if (t->getDynamicClassID() != cls ::getStaticClassID()) { \
4185       errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
4186     } \
4187     /* *t = *t; */ /*can't do this: coverage test for assignment op*/ \
4188   } \
4189   delete t; \
4190 } UPRV_BLOCK_MACRO_END
4191 
TestBoilerplate()4192 void TransliteratorTest::TestBoilerplate() {
4193     TEST_TRANSLIT_ID("Any-Latin", AnyTransliterator);
4194     TEST_TRANSLIT_ID("Any-Hex", EscapeTransliterator);
4195     TEST_TRANSLIT_ID("Hex-Any", UnescapeTransliterator);
4196     TEST_TRANSLIT_ID("Lower", LowercaseTransliterator);
4197     TEST_TRANSLIT_ID("Upper", UppercaseTransliterator);
4198     TEST_TRANSLIT_ID("Title", TitlecaseTransliterator);
4199     TEST_TRANSLIT_ID("Null", NullTransliterator);
4200     TEST_TRANSLIT_ID("Remove", RemoveTransliterator);
4201     TEST_TRANSLIT_ID("Any-Name", UnicodeNameTransliterator);
4202     TEST_TRANSLIT_ID("Name-Any", NameUnicodeTransliterator);
4203     TEST_TRANSLIT_ID("NFD", NormalizationTransliterator);
4204     TEST_TRANSLIT_ID("Latin-Greek", CompoundTransliterator);
4205     TEST_TRANSLIT_RULE("a>b;", RuleBasedTransliterator);
4206 }
4207 
TestAlternateSyntax()4208 void TransliteratorTest::TestAlternateSyntax() {
4209     // U+2206 == &
4210     // U+2190 == <
4211     // U+2192 == >
4212     // U+2194 == <>
4213     expect(CharsToUnicodeString("a \\u2192 x; b \\u2190 y; c \\u2194 z"),
4214            "abc",
4215            "xbz");
4216     expect(CharsToUnicodeString("([:^ASCII:]) \\u2192 \\u2206Name($1);"),
4217            CharsToUnicodeString("<=\\u2190; >=\\u2192; <>=\\u2194; &=\\u2206"),
4218            UNICODE_STRING_SIMPLE("<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}"));
4219 }
4220 
// Fragments shared by the whitespace/hyphen rule sets ([6]-[9], [13]).
// Adjacent string literals are concatenated, so each entry below expands to
// exactly one rule string.
#define BEGIN_END_WS_VARS \
    "$ws = [[:Separator:][\\u0009-\\u000C]$];" \
    "$delim = [\\-$ws];"
#define BEGIN_END_WS_RULES \
    "$ws $delim* > ' ';" \
    "'-' $delim* > '-';"

// Rule sets used by TestBeginEnd and TestBeginEndToRules, addressed by index.
// Entries that need ::BEGIN/::END are disabled; each keeps an empty
// placeholder string so that the indexes of the remaining entries stay put.
static const char* BEGIN_END_RULES[] = {
    // [0]
    "abc > xy;" "aba > z;",

    // [1] disabled:
    //   "::BEGIN;" "abc > xy;" "::END;" "::BEGIN;" "aba > z;" "::END;"
    "", // placeholder, keeps the indexes stable

    // [2] disabled:
    //   "abc > xy;" "::BEGIN;" "aba > z;" "::END;"
    "", // placeholder, keeps the indexes stable

    // [3] disabled:
    //   "::BEGIN;" "abc > xy;" "::END;" "aba > z;"
    "", // placeholder, keeps the indexes stable

    // [4]
    "abc > xy;" "::Null;" "aba > z;",

    // [5]
    "::Upper;"
    "ABC > xy;" "AB > x;" "C > z;"
    "::Upper;"
    "XYZ > p;" "XY > q;" "Z > r;"
    "::Upper;",

    // [6]
    BEGIN_END_WS_VARS
    BEGIN_END_WS_RULES,

    // [7]
    "::Null;"
    BEGIN_END_WS_VARS
    BEGIN_END_WS_RULES,

    // [8]
    BEGIN_END_WS_VARS
    BEGIN_END_WS_RULES
    "::Null;",

    // [9]
    BEGIN_END_WS_VARS
    "::Null;"
    BEGIN_END_WS_RULES,

    // [10] disabled:
    //   "::BEGIN;" BEGIN_END_WS_VARS "::END;" BEGIN_END_WS_RULES
    "", // placeholder, keeps the indexes stable

    // [11] disabled:
    //   BEGIN_END_WS_VARS "::BEGIN;" BEGIN_END_WS_RULES "::END;"
    "", // placeholder, keeps the indexes stable

    // [12] disabled:
    //   BEGIN_END_WS_VARS "$ab = [ab];"
    //   "::BEGIN;" BEGIN_END_WS_RULES "::END;"
    //   "::BEGIN;" "$ab { ' ' } $ab > '-';" "c { ' ' > ;" "::END;"
    //   "::BEGIN;" "'a-a' > a\\%|a;" "::END;"
    "", // placeholder, keeps the indexes stable

    // [13]
    BEGIN_END_WS_VARS
    "$ab = [ab];"
    "::Null;"
    BEGIN_END_WS_RULES
    "::Null;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::Null;"
    "'a-a' > a\\%|a;",

    // [14] disabled:
    //   "::[abc];" "::BEGIN;" "abc > xy;" "::END;"
    //   "::BEGIN;" "aba > yz;" "::END;" "::Upper;"
    "", // placeholder, keeps the indexes stable

    // [15]
    "::[abc];" "abc > xy;" "::Null;" "aba > yz;" "::Upper;",

    // [16] disabled:
    //   "::[abc];" "::BEGIN;" "abc <> xy;" "::END;"
    //   "::BEGIN;" "aba <> yz;" "::END;" "::Upper(Lower);" "::([XYZ]);"
    "", // placeholder, keeps the indexes stable

    // [17]
    "::[abc];" "abc <> xy;" "::Null;" "aba <> yz;" "::Upper(Lower);" "::([XYZ]);"
};

#undef BEGIN_END_WS_RULES
#undef BEGIN_END_WS_VARS
4394 
4395 /*
4396 (This entire test is commented out below and will need some heavy revision when we re-add
4397 the ::BEGIN/::END stuff)
4398 static const char* BOGUS_BEGIN_END_RULES[] = {
4399     // [7]
4400     "::BEGIN;"
4401     "abc > xy;"
4402     "::BEGIN;"
4403     "aba > z;"
4404     "::END;"
4405     "::END;",
4406 
4407     // [8]
4408     "abc > xy;"
4409     " aba > z;"
4410     "::END;",
4411 
4412     // [9]
4413     "::BEGIN;"
4414     "::Upper;"
4415     "::END;"
4416 };
4417 static const int32_t BOGUS_BEGIN_END_RULES_length = UPRV_LENGTHOF(BOGUS_BEGIN_END_RULES);
4418 */
4419 
4420 static const char* BEGIN_END_TEST_CASES[] = {
4421     // rules             input                   expected output
4422     BEGIN_END_RULES[0],  "abc ababc aba",        "xy zbc z",
4423 //    BEGIN_END_RULES[1],  "abc ababc aba",        "xy abxy z",
4424 //    BEGIN_END_RULES[2],  "abc ababc aba",        "xy abxy z",
4425 //    BEGIN_END_RULES[3],  "abc ababc aba",        "xy abxy z",
4426     BEGIN_END_RULES[4],  "abc ababc aba",        "xy abxy z",
4427     BEGIN_END_RULES[5],  "abccabaacababcbc",     "PXAARXQBR",
4428 
4429     BEGIN_END_RULES[6],  "e   e - e---e-  e",    "e e e-e-e",
4430     BEGIN_END_RULES[7],  "e   e - e---e-  e",    "e e e-e-e",
4431     BEGIN_END_RULES[8],  "e   e - e---e-  e",    "e e e-e-e",
4432     BEGIN_END_RULES[9],  "e   e - e---e-  e",    "e e e-e-e",
4433 //    BEGIN_END_RULES[10],  "e   e - e---e-  e",    "e e e-e-e",
4434 //    BEGIN_END_RULES[11], "e   e - e---e-  e",    "e e e-e-e",
4435 //    BEGIN_END_RULES[12], "e   e - e---e-  e",    "e e e-e-e",
4436 //    BEGIN_END_RULES[12], "a    a    a    a",     "a%a%a%a",
4437 //    BEGIN_END_RULES[12], "a a-b c b a",          "a%a-b cb-a",
4438     BEGIN_END_RULES[13], "e   e - e---e-  e",    "e e e-e-e",
4439     BEGIN_END_RULES[13], "a    a    a    a",     "a%a%a%a",
4440     BEGIN_END_RULES[13], "a a-b c b a",          "a%a-b cb-a",
4441 
4442 //    BEGIN_END_RULES[14], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
4443     BEGIN_END_RULES[15], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
4444 //    BEGIN_END_RULES[16], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
4445     BEGIN_END_RULES[17], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ"
4446 };
4447 static const int32_t BEGIN_END_TEST_CASES_length = UPRV_LENGTHOF(BEGIN_END_TEST_CASES);
4448 
TestBeginEnd()4449 void TransliteratorTest::TestBeginEnd() {
4450     // run through the list of test cases above
4451     int32_t i = 0;
4452     for (i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4453         expect((UnicodeString)"Test case #" + (i / 3),
4454                UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4455                UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4456                UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4457     }
4458 
4459     // instantiate the one reversible rule set in the reverse direction and make sure it does the right thing
4460     UParseError parseError;
4461     UErrorCode status = U_ZERO_ERROR;
4462     Transliterator* reversed  = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4463             UTRANS_REVERSE, parseError, status);
4464     if (reversed == 0 || U_FAILURE(status)) {
4465         reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4466     } else {
4467         expect(*reversed, UnicodeString("xy XY XYZ yz YZ"), UnicodeString("xy abc xaba yz aba"));
4468     }
4469     delete reversed;
4470 
4471     // finally, run through the list of syntactically-ill-formed rule sets above and make sure
4472     // that all of them cause errors
4473 /*
4474 (commented out until we have the real ::BEGIN/::END stuff in place
4475     for (i = 0; i < BOGUS_BEGIN_END_RULES_length; i++) {
4476         UParseError parseError;
4477         UErrorCode status = U_ZERO_ERROR;
4478         Transliterator* t = Transliterator::createFromRules("foo", UnicodeString(BOGUS_BEGIN_END_RULES[i]),
4479                 UTRANS_FORWARD, parseError, status);
4480         if (!U_FAILURE(status)) {
4481             delete t;
4482             errln((UnicodeString)"Should have gotten syntax error from " + BOGUS_BEGIN_END_RULES[i]);
4483         }
4484     }
4485 */
4486 }
4487 
TestBeginEndToRules()4488 void TransliteratorTest::TestBeginEndToRules() {
4489     // run through the same list of test cases we used above, but this time, instead of just
4490     // instantiating a Transliterator from the rules and running the test against it, we instantiate
4491     // a Transliterator from the rules, do toRules() on it, instantiate a Transliterator from
4492     // the resulting set of rules, and make sure that the generated rule set is semantically equivalent
4493     // to (i.e., does the same thing as) the original rule set
4494     for (int32_t i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4495         UParseError parseError;
4496         UErrorCode status = U_ZERO_ERROR;
4497         Transliterator* t = Transliterator::createFromRules("--", UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4498                 UTRANS_FORWARD, parseError, status);
4499         if (U_FAILURE(status)) {
4500             reportParseError(UnicodeString("FAIL: Couldn't create transliterator"), parseError, status);
4501         } else {
4502             UnicodeString rules;
4503             t->toRules(rules, true);
4504             Transliterator* t2 = Transliterator::createFromRules((UnicodeString)"Test case #" + (i / 3), rules,
4505                     UTRANS_FORWARD, parseError, status);
4506             if (U_FAILURE(status)) {
4507                 reportParseError(UnicodeString("FAIL: Couldn't create transliterator from generated rules"),
4508                         parseError, status);
4509                 delete t;
4510             } else {
4511                 expect(*t2,
4512                        UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4513                        UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4514                 delete t;
4515                 delete t2;
4516             }
4517         }
4518     }
4519 
4520     // do the same thing for the reversible test case
4521     UParseError parseError;
4522     UErrorCode status = U_ZERO_ERROR;
4523     Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4524             UTRANS_REVERSE, parseError, status);
4525     if (U_FAILURE(status)) {
4526         reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4527     } else {
4528         UnicodeString rules;
4529         reversed->toRules(rules, false);
4530         Transliterator* reversed2 = Transliterator::createFromRules("Reversed", rules, UTRANS_FORWARD,
4531                 parseError, status);
4532         if (U_FAILURE(status)) {
4533             reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator from generated rules"),
4534                     parseError, status);
4535             delete reversed;
4536         } else {
4537             expect(*reversed2,
4538                    UnicodeString("xy XY XYZ yz YZ"),
4539                    UnicodeString("xy abc xaba yz aba"));
4540             delete reversed;
4541             delete reversed2;
4542         }
4543     }
4544 }
4545 
TestRegisterAlias()4546 void TransliteratorTest::TestRegisterAlias() {
4547     UnicodeString longID("Lower;[aeiou]Upper");
4548     UnicodeString shortID("Any-CapVowels");
4549     UnicodeString reallyShortID("CapVowels");
4550 
4551     Transliterator::registerAlias(shortID, longID);
4552 
4553     UErrorCode err = U_ZERO_ERROR;
4554     Transliterator* t1 = Transliterator::createInstance(longID, UTRANS_FORWARD, err);
4555     if (U_FAILURE(err)) {
4556         errln("Failed to instantiate transliterator with long ID");
4557         Transliterator::unregister(shortID);
4558         return;
4559     }
4560     Transliterator* t2 = Transliterator::createInstance(reallyShortID, UTRANS_FORWARD, err);
4561     if (U_FAILURE(err)) {
4562         errln("Failed to instantiate transliterator with short ID");
4563         delete t1;
4564         Transliterator::unregister(shortID);
4565         return;
4566     }
4567 
4568     if (t1->getID() != longID)
4569         errln("Transliterator instantiated with long ID doesn't have long ID");
4570     if (t2->getID() != reallyShortID)
4571         errln("Transliterator instantiated with short ID doesn't have short ID");
4572 
4573     UnicodeString rules1;
4574     UnicodeString rules2;
4575 
4576     t1->toRules(rules1, true);
4577     t2->toRules(rules2, true);
4578     if (rules1 != rules2)
4579         errln("Alias transliterators aren't the same");
4580 
4581     delete t1;
4582     delete t2;
4583     Transliterator::unregister(shortID);
4584 
4585     t1 = Transliterator::createInstance(shortID, UTRANS_FORWARD, err);
4586     if (U_SUCCESS(err)) {
4587         errln("Instantiation with short ID succeeded after short ID was unregistered");
4588         delete t1;
4589     }
4590 
4591     // try the same thing again, but this time with something other than
4592     // an instance of CompoundTransliterator
4593     UnicodeString realID("Latin-Greek");
4594     UnicodeString fakeID("Latin-dlgkjdflkjdl");
4595     Transliterator::registerAlias(fakeID, realID);
4596 
4597     err = U_ZERO_ERROR;
4598     t1 = Transliterator::createInstance(realID, UTRANS_FORWARD, err);
4599     if (U_FAILURE(err)) {
4600         dataerrln("Failed to instantiate transliterator with real ID - %s", u_errorName(err));
4601         Transliterator::unregister(realID);
4602         return;
4603     }
4604     t2 = Transliterator::createInstance(fakeID, UTRANS_FORWARD, err);
4605     if (U_FAILURE(err)) {
4606         errln("Failed to instantiate transliterator with fake ID");
4607         delete t1;
4608         Transliterator::unregister(realID);
4609         return;
4610     }
4611 
4612     t1->toRules(rules1, true);
4613     t2->toRules(rules2, true);
4614     if (rules1 != rules2)
4615         errln("Alias transliterators aren't the same");
4616 
4617     delete t1;
4618     delete t2;
4619     Transliterator::unregister(fakeID);
4620 }
4621 
TestRuleStripping()4622 void TransliteratorTest::TestRuleStripping() {
4623     /*
4624 #
4625 \uE001>\u0C01; # SIGN
4626     */
4627     static const UChar rule[] = {
4628         0x0023,0x0020,0x000D,0x000A,
4629         0xE001,0x003E,0x0C01,0x003B,0x0020,0x0023,0x0020,0x0053,0x0049,0x0047,0x004E,0
4630     };
4631     static const UChar expectedRule[] = {
4632         0xE001,0x003E,0x0C01,0x003B,0
4633     };
4634     UChar result[UPRV_LENGTHOF(rule)];
4635     UErrorCode status = U_ZERO_ERROR;
4636     int32_t len = utrans_stripRules(rule, UPRV_LENGTHOF(rule), result, &status);
4637     if (len != u_strlen(expectedRule)) {
4638         errln("utrans_stripRules return len = %d", len);
4639     }
4640     if (u_strncmp(expectedRule, result, len) != 0) {
4641         errln("utrans_stripRules did not return expected string");
4642     }
4643 }
4644 
4645 /**
4646  * Test the Halfwidth-Fullwidth transliterator (ticket 6281).
4647  */
TestHalfwidthFullwidth(void)4648 void TransliteratorTest::TestHalfwidthFullwidth(void) {
4649     UParseError parseError;
4650     UErrorCode status = U_ZERO_ERROR;
4651     Transliterator* hf = Transliterator::createInstance("Halfwidth-Fullwidth", UTRANS_FORWARD, parseError, status);
4652     Transliterator* fh = Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, parseError, status);
4653     if (hf == 0 || fh == 0) {
4654         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4655         delete hf;
4656         delete fh;
4657         return;
4658     }
4659 
4660     // Array of 2n items
4661     // Each item is
4662     //   "hf"|"fh"|"both",
4663     //   <Halfwidth>,
4664     //   <Fullwidth>
4665     const char* DATA[] = {
4666         "both",
4667         "\\uFFE9\\uFFEA\\uFFEB\\uFFEC\\u0061\\uFF71\\u00AF\\u0020",
4668         "\\u2190\\u2191\\u2192\\u2193\\uFF41\\u30A2\\uFFE3\\u3000",
4669     };
4670     int32_t DATA_length = UPRV_LENGTHOF(DATA);
4671 
4672     for (int32_t i=0; i<DATA_length; i+=3) {
4673         UnicodeString h = CharsToUnicodeString(DATA[i+1]);
4674         UnicodeString f = CharsToUnicodeString(DATA[i+2]);
4675         switch (*DATA[i]) {
4676         case 0x68: //'h': // Halfwidth-Fullwidth only
4677             expect(*hf, h, f);
4678             break;
4679         case 0x66: //'f': // Fullwidth-Halfwidth only
4680             expect(*fh, f, h);
4681             break;
4682         case 0x62: //'b': // both directions
4683             expect(*hf, h, f);
4684             expect(*fh, f, h);
4685             break;
4686         }
4687     }
4688     delete hf;
4689     delete fh;
4690 }
4691 
4692 
4693     /**
4694      *  Test Thai.  The text is the first paragraph of "What is Unicode" from the Unicode.org web site.
4695      *              TODO: confirm that the expected results are correct.
4696      *              For now, test just confirms that C++ and Java give identical results.
4697      */
TestThai(void)4698 void TransliteratorTest::TestThai(void) {
4699 #if !UCONFIG_NO_BREAK_ITERATION
4700     // The expectations in this test heavily depends on the Thai dictionary.
4701     // Therefore, we skip this test under the LSTM configuration.
4702     if (skipDictionaryTest()) {
4703         return;
4704     }
4705     UParseError parseError;
4706     UErrorCode status = U_ZERO_ERROR;
4707     Transliterator* tr = Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4708     if (tr == 0) {
4709         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4710         return;
4711     }
4712     if (U_FAILURE(status)) {
4713         errln("FAIL: createInstance failed with %s", u_errorName(status));
4714         return;
4715     }
4716     const char *thaiText =
4717         "\\u0e42\\u0e14\\u0e22\\u0e1e\\u0e37\\u0e49\\u0e19\\u0e10\\u0e32\\u0e19\\u0e41\\u0e25\\u0e49\\u0e27, \\u0e04\\u0e2d"
4718         "\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d\\u0e23\\u0e4c\\u0e08\\u0e30\\u0e40\\u0e01\\u0e35\\u0e48\\u0e22"
4719         "\\u0e27\\u0e02\\u0e49\\u0e2d\\u0e07\\u0e01\\u0e31\\u0e1a\\u0e40\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e02\\u0e2d"
4720         "\\u0e07\\u0e15\\u0e31\\u0e27\\u0e40\\u0e25\\u0e02. \\u0e04\\u0e2d\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d"
4721         "\\u0e23\\u0e4c\\u0e08\\u0e31\\u0e14\\u0e40\\u0e01\\u0e47\\u0e1a\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29"
4722         "\\u0e23\\u0e41\\u0e25\\u0e30\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30\\u0e2d\\u0e37\\u0e48\\u0e19\\u0e46 \\u0e42"
4723         "\\u0e14\\u0e22\\u0e01\\u0e32\\u0e23\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25"
4724         "\\u0e02\\u0e43\\u0e2b\\u0e49\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e41\\u0e15\\u0e48\\u0e25\\u0e30\\u0e15"
4725         "\\u0e31\\u0e27. \\u0e01\\u0e48\\u0e2d\\u0e19\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48\\u0e4a Unicode \\u0e08"
4726         "\\u0e30\\u0e16\\u0e39\\u0e01\\u0e2a\\u0e23\\u0e49\\u0e32\\u0e07\\u0e02\\u0e36\\u0e49\\u0e19, \\u0e44\\u0e14\\u0e49"
4727         "\\u0e21\\u0e35\\u0e23\\u0e30\\u0e1a\\u0e1a encoding \\u0e2d\\u0e22\\u0e39\\u0e48\\u0e2b\\u0e25\\u0e32\\u0e22\\u0e23"
4728         "\\u0e49\\u0e2d\\u0e22\\u0e23\\u0e30\\u0e1a\\u0e1a\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e01\\u0e32\\u0e23"
4729         "\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25\\u0e02\\u0e40\\u0e2b\\u0e25\\u0e48"
4730         "\\u0e32\\u0e19\\u0e35\\u0e49. \\u0e44\\u0e21\\u0e48\\u0e21\\u0e35 encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48"
4731         "\\u0e21\\u0e35\\u0e08\\u0e33\\u0e19\\u0e27\\u0e19\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30"
4732         "\\u0e21\\u0e32\\u0e01\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d: \\u0e22\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d"
4733         "\\u0e22\\u0e48\\u0e32\\u0e07\\u0e40\\u0e0a\\u0e48\\u0e19, \\u0e40\\u0e09\\u0e1e\\u0e32\\u0e30\\u0e43\\u0e19\\u0e01"
4734         "\\u0e25\\u0e38\\u0e48\\u0e21\\u0e2a\\u0e2b\\u0e20\\u0e32\\u0e1e\\u0e22\\u0e38\\u0e42\\u0e23\\u0e1b\\u0e40\\u0e1e"
4735         "\\u0e35\\u0e22\\u0e07\\u0e41\\u0e2b\\u0e48\\u0e07\\u0e40\\u0e14\\u0e35\\u0e22\\u0e27 \\u0e01\\u0e47\\u0e15\\u0e49"
4736         "\\u0e2d\\u0e07\\u0e01\\u0e32\\u0e23\\u0e2b\\u0e25\\u0e32\\u0e22 encoding \\u0e43\\u0e19\\u0e01\\u0e32\\u0e23\\u0e04"
4737         "\\u0e23\\u0e2d\\u0e1a\\u0e04\\u0e25\\u0e38\\u0e21\\u0e17\\u0e38\\u0e01\\u0e20\\u0e32\\u0e29\\u0e32\\u0e43\\u0e19"
4738         "\\u0e01\\u0e25\\u0e38\\u0e48\\u0e21. \\u0e2b\\u0e23\\u0e37\\u0e2d\\u0e41\\u0e21\\u0e49\\u0e41\\u0e15\\u0e48\\u0e43"
4739         "\\u0e19\\u0e20\\u0e32\\u0e29\\u0e32\\u0e40\\u0e14\\u0e35\\u0e48\\u0e22\\u0e27 \\u0e40\\u0e0a\\u0e48\\u0e19 \\u0e20"
4740         "\\u0e32\\u0e29\\u0e32\\u0e2d\\u0e31\\u0e07\\u0e01\\u0e24\\u0e29 \\u0e01\\u0e47\\u0e44\\u0e21\\u0e48\\u0e21\\u0e35"
4741         " encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d\\u0e2a\\u0e33\\u0e2b"
4742         "\\u0e23\\u0e31\\u0e1a\\u0e17\\u0e38\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29\\u0e23, \\u0e40\\u0e04"
4743         "\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e27\\u0e23\\u0e23\\u0e04\\u0e15\\u0e2d\\u0e19"
4744         " \\u0e41\\u0e25\\u0e30\\u0e2a\\u0e31\\u0e0d\\u0e25\\u0e31\\u0e01\\u0e29\\u0e13\\u0e4c\\u0e17\\u0e32\\u0e07\\u0e40"
4745         "\\u0e17\\u0e04\\u0e19\\u0e34\\u0e04\\u0e17\\u0e35\\u0e48\\u0e43\\u0e0a\\u0e49\\u0e01\\u0e31\\u0e19\\u0e2d\\u0e22"
4746         "\\u0e39\\u0e48\\u0e17\\u0e31\\u0e48\\u0e27\\u0e44\\u0e1b.";
4747 
4748     const char *latinText =
4749         "doy ph\\u1ee5\\u0304\\u0302n \\u1e6d\\u0304h\\u0101n l\\u00e6\\u0302w, khxmphiwtexr\\u0312 ca ke\\u012b\\u0300"
4750         "ywk\\u0304\\u0125xng k\\u1ea1b re\\u1ee5\\u0304\\u0300xng k\\u0304hxng t\\u1ea1wlek\\u0304h. khxmphiwtexr"
4751         "\\u0312 c\\u1ea1d k\\u0115b t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r l\\u00e6a x\\u1ea1kk\\u0304h ra x\\u1ee5\\u0304"
4752         "\\u0300n\\u00ab doy k\\u0101r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304\\u0131\\u0302 s\\u0304"
4753         "\\u1ea3h\\u0304r\\u1ea1b t\\u00e6\\u0300la t\\u1ea1w. k\\u0300xn h\\u0304n\\u0302\\u0101 th\\u012b\\u0300\\u0301"
4754         " Unicode ca t\\u0304h\\u016bk s\\u0304r\\u0302\\u0101ng k\\u0304h\\u1ee5\\u0302n, d\\u1ecb\\u0302 m\\u012b "
4755         "rabb encoding xy\\u016b\\u0300 h\\u0304l\\u0101y r\\u0302xy rabb s\\u0304\\u1ea3h\\u0304r\\u1ea1b k\\u0101"
4756         "r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304el\\u0300\\u0101 n\\u012b\\u0302. m\\u1ecb\\u0300m"
4757         "\\u012b encoding d\\u0131 th\\u012b\\u0300 m\\u012b c\\u1ea3nwn t\\u1ea1w x\\u1ea1kk\\u0304hra m\\u0101k p"
4758         "he\\u012byng phx: yk t\\u1ea1wx\\u1ef3\\u0101ng ch\\u00e8n, c\\u0304heph\\u0101a n\\u0131 kl\\u00f9m s\\u0304"
4759         "h\\u0304p\\u0323h\\u0101ph yurop phe\\u012byng h\\u0304\\u00e6\\u0300ng de\\u012byw k\\u0306 t\\u0302xngk\\u0101"
4760         "r h\\u0304l\\u0101y encoding n\\u0131 k\\u0101r khrxbkhlum thuk p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 n\\u0131"
4761         " kl\\u00f9m. h\\u0304r\\u1ee5\\u0304x m\\u00e6\\u0302t\\u00e6\\u0300 n\\u0131 p\\u0323h\\u0101s\\u0304\\u02b9"
4762         "\\u0101 de\\u012b\\u0300yw ch\\u00e8n p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 x\\u1ea1ngkvs\\u0304\\u02b9 k\\u0306"
4763         " m\\u1ecb\\u0300m\\u012b encoding d\\u0131 th\\u012b\\u0300 phe\\u012byng phx s\\u0304\\u1ea3h\\u0304r\\u1ea1"
4764         "b thuk t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r, kher\\u1ee5\\u0304\\u0300xngh\\u0304m\\u0101y wrrkh txn l\\u00e6"
4765         "a s\\u0304\\u1ea1\\u1ef5l\\u1ea1ks\\u0304\\u02b9\\u1e47\\u0312 th\\u0101ng thekhnikh th\\u012b\\u0300 ch\\u0131"
4766         "\\u0302 k\\u1ea1n xy\\u016b\\u0300 th\\u1ea1\\u0300wp\\u1ecb.";
4767 
4768 
4769     UnicodeString  xlitText(thaiText);
4770     xlitText = xlitText.unescape();
4771     tr->transliterate(xlitText);
4772 
4773     UnicodeString expectedText(latinText);
4774     expectedText = expectedText.unescape();
4775     expect(*tr, xlitText, expectedText);
4776 
4777     delete tr;
4778 #endif
4779 }
4780 
4781 
4782 //======================================================================
4783 // Support methods
4784 //======================================================================
expectT(const UnicodeString & id,const UnicodeString & source,const UnicodeString & expectedResult)4785 void TransliteratorTest::expectT(const UnicodeString& id,
4786                                  const UnicodeString& source,
4787                                  const UnicodeString& expectedResult) {
4788     UErrorCode ec = U_ZERO_ERROR;
4789     UParseError pe;
4790     Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
4791     if (U_FAILURE(ec)) {
4792         errln((UnicodeString)"FAIL: Could not create " + id + " -  " + u_errorName(ec));
4793         delete t;
4794         return;
4795     }
4796     expect(*t, source, expectedResult);
4797     delete t;
4798 }
4799 
reportParseError(const UnicodeString & message,const UParseError & parseError,const UErrorCode & status)4800 void TransliteratorTest::reportParseError(const UnicodeString& message,
4801                                           const UParseError& parseError,
4802                                           const UErrorCode& status) {
4803     dataerrln(message +
4804           /*", parse error " + parseError.code +*/
4805           ", line " + parseError.line +
4806           ", offset " + parseError.offset +
4807           ", pre-context " + prettify(parseError.preContext, true) +
4808           ", post-context " + prettify(parseError.postContext,true) +
4809           ", Error: " + u_errorName(status));
4810 }
4811 
expect(const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4812 void TransliteratorTest::expect(const UnicodeString& rules,
4813                                 const UnicodeString& source,
4814                                 const UnicodeString& expectedResult,
4815                                 UTransPosition *pos) {
4816     expect("<ID>", rules, source, expectedResult, pos);
4817 }
4818 
expect(const UnicodeString & id,const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4819 void TransliteratorTest::expect(const UnicodeString& id,
4820                                 const UnicodeString& rules,
4821                                 const UnicodeString& source,
4822                                 const UnicodeString& expectedResult,
4823                                 UTransPosition *pos) {
4824     UErrorCode status = U_ZERO_ERROR;
4825     UParseError parseError;
4826     Transliterator* t = Transliterator::createFromRules(id, rules, UTRANS_FORWARD, parseError, status);
4827     if (U_FAILURE(status)) {
4828         reportParseError(UnicodeString("Couldn't create transliterator from ") + rules, parseError, status);
4829     } else {
4830         expect(*t, source, expectedResult, pos);
4831     }
4832     delete t;
4833 }
4834 
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,const Transliterator & reverseTransliterator)4835 void TransliteratorTest::expect(const Transliterator& t,
4836                                 const UnicodeString& source,
4837                                 const UnicodeString& expectedResult,
4838                                 const Transliterator& reverseTransliterator) {
4839     expect(t, source, expectedResult);
4840     expect(reverseTransliterator, expectedResult, source);
4841 }
4842 
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4843 void TransliteratorTest::expect(const Transliterator& t,
4844                                 const UnicodeString& source,
4845                                 const UnicodeString& expectedResult,
4846                                 UTransPosition *pos) {
4847     if (pos == 0) {
4848         UnicodeString result(source);
4849         t.transliterate(result);
4850         expectAux(t.getID() + ":String", source, result, expectedResult);
4851     }
4852     UTransPosition index={0, 0, 0, 0};
4853     if (pos != 0) {
4854         index = *pos;
4855     }
4856 
4857     UnicodeString rsource(source);
4858     if (pos == 0) {
4859         t.transliterate(rsource);
4860     } else {
4861         // Do it all at once -- below we do it incrementally
4862         t.finishTransliteration(rsource, *pos);
4863     }
4864     expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
4865 
4866     // Test keyboard (incremental) transliteration -- this result
4867     // must be the same after we finalize (see below).
4868     UnicodeString log;
4869     rsource.remove();
4870     if (pos != 0) {
4871         rsource = source;
4872         formatInput(log, rsource, index);
4873         log.append(" -> ");
4874         UErrorCode status = U_ZERO_ERROR;
4875         t.transliterate(rsource, index, status);
4876         formatInput(log, rsource, index);
4877     } else {
4878         for (int32_t i=0; i<source.length(); ++i) {
4879             if (i != 0) {
4880                 log.append(" + ");
4881             }
4882             log.append(source.charAt(i)).append(" -> ");
4883             UErrorCode status = U_ZERO_ERROR;
4884             t.transliterate(rsource, index, source.charAt(i), status);
4885             formatInput(log, rsource, index);
4886         }
4887     }
4888 
4889     // As a final step in keyboard transliteration, we must call
4890     // transliterate to finish off any pending partial matches that
4891     // were waiting for more input.
4892     t.finishTransliteration(rsource, index);
4893     log.append(" => ").append(rsource);
4894 
4895     expectAux(t.getID() + ":Keyboard", log,
4896               rsource == expectedResult,
4897               expectedResult);
4898 }
4899 
4900 
4901 /**
4902  * @param appendTo result is appended to this param.
4903  * @param input the string being transliterated
4904  * @param pos the index struct
4905  */
formatInput(UnicodeString & appendTo,const UnicodeString & input,const UTransPosition & pos)4906 UnicodeString& TransliteratorTest::formatInput(UnicodeString &appendTo,
4907                                                const UnicodeString& input,
4908                                                const UTransPosition& pos) {
4909     // Output a string of the form aaa{bbb|ccc|ddd}eee, where
4910     // the {} indicate the context start and limit, and the ||
4911     // indicate the start and limit.
4912     if (0 <= pos.contextStart &&
4913         pos.contextStart <= pos.start &&
4914         pos.start <= pos.limit &&
4915         pos.limit <= pos.contextLimit &&
4916         pos.contextLimit <= input.length()) {
4917 
4918         UnicodeString a, b, c, d, e;
4919         input.extractBetween(0, pos.contextStart, a);
4920         input.extractBetween(pos.contextStart, pos.start, b);
4921         input.extractBetween(pos.start, pos.limit, c);
4922         input.extractBetween(pos.limit, pos.contextLimit, d);
4923         input.extractBetween(pos.contextLimit, input.length(), e);
4924         appendTo.append(a).append((UChar)123/*{*/).append(b).
4925             append((UChar)PIPE).append(c).append((UChar)PIPE).append(d).
4926             append((UChar)125/*}*/).append(e);
4927     } else {
4928         appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
4929                         pos.contextStart + ", s=" + pos.start + ", l=" +
4930                         pos.limit + ", cl=" + pos.contextLimit + "} on " +
4931                         input);
4932     }
4933     return appendTo;
4934 }
4935 
expectAux(const UnicodeString & tag,const UnicodeString & source,const UnicodeString & result,const UnicodeString & expectedResult)4936 void TransliteratorTest::expectAux(const UnicodeString& tag,
4937                                    const UnicodeString& source,
4938                                    const UnicodeString& result,
4939                                    const UnicodeString& expectedResult) {
4940     expectAux(tag, source + " -> " + result,
4941               result == expectedResult,
4942               expectedResult);
4943 }
4944 
expectAux(const UnicodeString & tag,const UnicodeString & summary,UBool pass,const UnicodeString & expectedResult)4945 void TransliteratorTest::expectAux(const UnicodeString& tag,
4946                                    const UnicodeString& summary, UBool pass,
4947                                    const UnicodeString& expectedResult) {
4948     if (pass) {
4949         logln(UnicodeString("(")+tag+") " + prettify(summary));
4950     } else {
4951         dataerrln(UnicodeString("FAIL: (")+tag+") "
4952               + prettify(summary)
4953               + ", expected " + prettify(expectedResult));
4954     }
4955 }
4956 
4957 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
4958