• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   11/10/99    aliu        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "transtst.h"
18 #include "unicode/locid.h"
19 #include "unicode/dtfmtsym.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/translit.h"
22 #include "unicode/uchar.h"
23 #include "unicode/unifilt.h"
24 #include "unicode/uniset.h"
25 #include "unicode/ustring.h"
26 #include "unicode/usetiter.h"
27 #include "unicode/uscript.h"
28 #include "unicode/utf16.h"
29 #include "cpdtrans.h"
30 #include "nultrans.h"
31 #include "rbt.h"
32 #include "rbt_pars.h"
33 #include "anytrans.h"
34 #include "esctrn.h"
35 #include "name2uni.h"
36 #include "nortrans.h"
37 #include "remtrans.h"
38 #include "titletrn.h"
39 #include "tolowtrn.h"
40 #include "toupptrn.h"
41 #include "unesctrn.h"
42 #include "uni2name.h"
43 #include "cstring.h"
44 #include "cmemory.h"
45 #include <stdio.h>
46 
47 /***********************************************************************
48 
49                      HOW TO USE THIS TEST FILE
50                                -or-
51                   How I developed on two platforms
52                 without losing (too much of) my mind
53 
54 
55 1. Add new tests by copying/pasting/changing existing tests.  On Java,
56    any public void method named Test...() taking no parameters becomes
57    a test.  On C++, you need to modify the header and add a line to
58    the runIndexedTest() dispatch method.
59 
60 2. Make liberal use of the expect() method; it is your friend.
61 
62 3. The tests in this file exactly match those in a sister file on the
63    other side.  The two files are:
64 
65    icu4j:  src/com/ibm/test/translit/TransliteratorTest.java
66    icu4c:  source/test/intltest/transtst.cpp
67 
68                   ==> THIS IS THE IMPORTANT PART <==
69 
70    When you add a test in this file, add it in TransliteratorTest.java
71    too.  Give it the same name and put it in the same relative place.
72    This makes maintenance a lot simpler for any poor soul who ends up
73    trying to synchronize the tests between icu4j and icu4c.
74 
75 4. If you MUST enter a test that is NOT paralleled in the sister file,
76    then add it in the special non-mirrored section.  These are
77    labeled
78 
79      "icu4j ONLY"
80 
81    or
82 
83      "icu4c ONLY"
84 
85    Make sure you document the reason the test is here and not there.
86 
87 
88 Thank you.
89 The Management
90 ***********************************************************************/
91 
// Character constants are written as numeric code points, not character
// literals, so that this file stays EBCDIC-friendly.
enum {
    LEFT_BRACE = static_cast<char16_t>(0x007B), // '{'
    PIPE       = static_cast<char16_t>(0x007C), // '|'
    ZERO       = static_cast<char16_t>(0x0030), // '0'
    UPPER_A    = static_cast<char16_t>(0x0041)  // 'A'
};
99 
TransliteratorTest()100 TransliteratorTest::TransliteratorTest()
101 :   DESERET_DEE((UChar32)0x10414),
102     DESERET_dee((UChar32)0x1043C)
103 {
104 }
105 
~TransliteratorTest()106 TransliteratorTest::~TransliteratorTest() {}
107 
108 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)109 TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
110                                    const char* &name, char* /*par*/) {
111     switch (index) {
112         TESTCASE(0,TestInstantiation);
113         TESTCASE(1,TestSimpleRules);
114         TESTCASE(2,TestRuleBasedInverse);
115         TESTCASE(3,TestKeyboard);
116         TESTCASE(4,TestKeyboard2);
117         TESTCASE(5,TestKeyboard3);
118         TESTCASE(6,TestArabic);
119         TESTCASE(7,TestCompoundKana);
120         TESTCASE(8,TestCompoundHex);
121         TESTCASE(9,TestFiltering);
122         TESTCASE(10,TestInlineSet);
123         TESTCASE(11,TestPatternQuoting);
124         TESTCASE(12,TestJ277);
125         TESTCASE(13,TestJ243);
126         TESTCASE(14,TestJ329);
127         TESTCASE(15,TestSegments);
128         TESTCASE(16,TestCursorOffset);
129         TESTCASE(17,TestArbitraryVariableValues);
130         TESTCASE(18,TestPositionHandling);
131         TESTCASE(19,TestHiraganaKatakana);
132         TESTCASE(20,TestCopyJ476);
133         TESTCASE(21,TestAnchors);
134         TESTCASE(22,TestInterIndic);
135         TESTCASE(23,TestFilterIDs);
136         TESTCASE(24,TestCaseMap);
137         TESTCASE(25,TestNameMap);
138         TESTCASE(26,TestLiberalizedID);
139         TESTCASE(27,TestCreateInstance);
140         TESTCASE(28,TestNormalizationTransliterator);
141         TESTCASE(29,TestCompoundRBT);
142         TESTCASE(30,TestCompoundFilter);
143         TESTCASE(31,TestRemove);
144         TESTCASE(32,TestToRules);
145         TESTCASE(33,TestContext);
146         TESTCASE(34,TestSupplemental);
147         TESTCASE(35,TestQuantifier);
148         TESTCASE(36,TestSTV);
149         TESTCASE(37,TestCompoundInverse);
150         TESTCASE(38,TestNFDChainRBT);
151         TESTCASE(39,TestNullInverse);
152         TESTCASE(40,TestAliasInverseID);
153         TESTCASE(41,TestCompoundInverseID);
154         TESTCASE(42,TestUndefinedVariable);
155         TESTCASE(43,TestEmptyContext);
156         TESTCASE(44,TestCompoundFilterID);
157         TESTCASE(45,TestPropertySet);
158         TESTCASE(46,TestNewEngine);
159         TESTCASE(47,TestQuantifiedSegment);
160         TESTCASE(48,TestDevanagariLatinRT);
161         TESTCASE(49,TestTeluguLatinRT);
162         TESTCASE(50,TestCompoundLatinRT);
163         TESTCASE(51,TestSanskritLatinRT);
164         TESTCASE(52,TestLocaleInstantiation);
165         TESTCASE(53,TestTitleAccents);
166         TESTCASE(54,TestLocaleResource);
167         TESTCASE(55,TestParseError);
168         TESTCASE(56,TestOutputSet);
169         TESTCASE(57,TestVariableRange);
170         TESTCASE(58,TestInvalidPostContext);
171         TESTCASE(59,TestIDForms);
172         TESTCASE(60,TestToRulesMark);
173         TESTCASE(61,TestEscape);
174         TESTCASE(62,TestAnchorMasking);
175         TESTCASE(63,TestDisplayName);
176         TESTCASE(64,TestSpecialCases);
177 #if !UCONFIG_NO_FILE_IO
178         TESTCASE(65,TestIncrementalProgress);
179 #endif
180         TESTCASE(66,TestSurrogateCasing);
181         TESTCASE(67,TestFunction);
182         TESTCASE(68,TestInvalidBackRef);
183         TESTCASE(69,TestMulticharStringSet);
184         TESTCASE(70,TestUserFunction);
185         TESTCASE(71,TestAnyX);
186         TESTCASE(72,TestSourceTargetSet);
187         TESTCASE(73,TestGurmukhiDevanagari);
188         TESTCASE(74,TestPatternWhiteSpace);
189         TESTCASE(75,TestAllCodepoints);
190         TESTCASE(76,TestBoilerplate);
191         TESTCASE(77,TestAlternateSyntax);
192         TESTCASE(78,TestBeginEnd);
193         TESTCASE(79,TestBeginEndToRules);
194         TESTCASE(80,TestRegisterAlias);
195         TESTCASE(81,TestRuleStripping);
196         TESTCASE(82,TestHalfwidthFullwidth);
197         TESTCASE(83,TestThai);
198         TESTCASE(84,TestAny);
199         TESTCASE(85,TestBasicTransliteratorEvenWithoutData);
200         default: name = ""; break;
201     }
202 }
203 
204 /**
205  * Make sure every system transliterator can be instantiated.
206  *
207  * ALSO test that the result of toRules() for each rule is a valid
208  * rule.  Do this here so we don't have to have another test that
209  * instantiates everything as well.
210  */
TestInstantiation()211 void TransliteratorTest::TestInstantiation() {
212     UErrorCode ec = U_ZERO_ERROR;
213     StringEnumeration* avail = Transliterator::getAvailableIDs(ec);
214     assertSuccess("getAvailableIDs()", ec);
215     assertTrue("getAvailableIDs()!=nullptr", avail!=nullptr);
216     int32_t n = Transliterator::countAvailableIDs();
217     assertTrue("getAvailableIDs().count()==countAvailableIDs()",
218                avail->count(ec) == n);
219     assertSuccess("count()", ec);
220     UnicodeString name;
221     for (int32_t i=0; i<n; ++i) {
222         const UnicodeString& id = *avail->snext(ec);
223         if (!assertSuccess("snext()", ec) ||
224             !assertTrue("snext()!=nullptr", (&id)!=nullptr, true)) {
225             break;
226         }
227         UnicodeString id2 = Transliterator::getAvailableID(i);
228         if (id.length() < 1) {
229             errln(UnicodeString("FAIL: getAvailableID(") +
230                   i + ") returned empty string");
231             continue;
232         }
233         if (id != id2) {
234             errln(UnicodeString("FAIL: getAvailableID(") +
235                   i + ") != getAvailableIDs().snext()");
236             continue;
237         }
238         UParseError parseError;
239         UErrorCode status = U_ZERO_ERROR;
240         Transliterator* t = Transliterator::createInstance(id,
241                               UTRANS_FORWARD, parseError,status);
242         name.truncate(0);
243         Transliterator::getDisplayName(id, name);
244         if (t == 0) {
245 #if UCONFIG_NO_BREAK_ITERATION
246             // If UCONFIG_NO_BREAK_ITERATION is on, then only Thai should fail.
247             if (id.compare((UnicodeString)"Thai-Latn") != 0 &&
248                 id.compare((UnicodeString)"Thai-Latin") != 0)
249 #endif
250                 dataerrln(UnicodeString("FAIL: Couldn't create ") + id +
251                       /*", parse error " + parseError.code +*/
252                       ", line " + parseError.line +
253                       ", offset " + parseError.offset +
254                       ", pre-context " + prettify(parseError.preContext, true) +
255                       ", post-context " +prettify(parseError.postContext,true) +
256                       ", Error: " + u_errorName(status));
257                 // When createInstance fails, it deletes the failing
258                 // entry from the available ID list.  We detect this
259                 // here by looking for a change in countAvailableIDs.
260             int32_t nn = Transliterator::countAvailableIDs();
261             if (nn == (n - 1)) {
262                 n = nn;
263                 --i; // Compensate for deleted entry
264             }
265         } else {
266             logln(UnicodeString("OK: ") + name + " (" + id + ")");
267 
268             // Now test toRules
269             UnicodeString rules;
270             t->toRules(rules, true);
271             Transliterator *u = Transliterator::createFromRules("x",
272                                     rules, UTRANS_FORWARD, parseError,status);
273             if (u == 0) {
274                 errln(UnicodeString("FAIL: ") + id +
275                       ".createFromRules() => bad rules" +
276                       /*", parse error " + parseError.code +*/
277                       ", line " + parseError.line +
278                       ", offset " + parseError.offset +
279                       ", context " + prettify(parseError.preContext, true) +
280                       ", rules: " + prettify(rules, true));
281             } else {
282                 delete u;
283             }
284             delete t;
285         }
286     }
287     assertTrue("snext()==nullptr", avail->snext(ec)==nullptr);
288     assertSuccess("snext()", ec);
289     delete avail;
290 
291     // Now test the failure path
292     UParseError parseError;
293     UErrorCode status = U_ZERO_ERROR;
294     UnicodeString id("<Not a valid Transliterator ID>");
295     Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
296     if (t != 0) {
297         errln("FAIL: " + id + " returned a transliterator");
298         delete t;
299     } else {
300         logln("OK: Bogus ID handled properly");
301     }
302 }
303 
TestSimpleRules()304 void TransliteratorTest::TestSimpleRules() {
305     /* Example: rules 1. ab>x|y
306      *                2. yc>z
307      *
308      * []|eabcd  start - no match, copy e to translated buffer
309      * [e]|abcd  match rule 1 - copy output & adjust cursor
310      * [ex|y]cd  match rule 2 - copy output & adjust cursor
311      * [exz]|d   no match, copy d to transliterated buffer
312      * [exzd]|   done
313      */
314     expect(UnicodeString("ab>x|y;", "") +
315            "yc>z",
316            "eabcd", "exzd");
317 
318     /* Another set of rules:
319      *    1. ab>x|yzacw
320      *    2. za>q
321      *    3. qc>r
322      *    4. cw>n
323      *
324      * []|ab       Rule 1
325      * [x|yzacw]   No match
326      * [xy|zacw]   Rule 2
327      * [xyq|cw]    Rule 4
328      * [xyqn]|     Done
329      */
330     expect(UnicodeString("ab>x|yzacw;") +
331            "za>q;" +
332            "qc>r;" +
333            "cw>n",
334            "ab", "xyqn");
335 
336     /* Test categories
337      */
338     UErrorCode status = U_ZERO_ERROR;
339     UParseError parseError;
340     Transliterator *t = Transliterator::createFromRules(
341         "<ID>",
342         UnicodeString("$dummy=").append((char16_t)0xE100) +
343         UnicodeString(";"
344                       "$vowel=[aeiouAEIOU];"
345                       "$lu=[:Lu:];"
346                       "$vowel } $lu > '!';"
347                       "$vowel > '&';"
348                       "'!' { $lu > '^';"
349                       "$lu > '*';"
350                       "a > ERROR", ""),
351         UTRANS_FORWARD, parseError,
352         status);
353     if (U_FAILURE(status)) {
354         dataerrln("FAIL: RBT constructor failed - %s", u_errorName(status));
355         return;
356     }
357     expect(*t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
358     delete t;
359 }
360 
361 /**
362  * Test inline set syntax and set variable syntax.
363  */
TestInlineSet()364 void TransliteratorTest::TestInlineSet() {
365     expect("{ [:Ll:] } x > y; [:Ll:] > z;", "aAbxq", "zAyzz");
366     expect("a[0-9]b > qrs", "1a7b9", "1qrs9");
367 
368     expect(UnicodeString(
369            "$digit = [0-9];"
370            "$alpha = [a-zA-Z];"
371            "$alphanumeric = [$digit $alpha];" // ***
372            "$special = [^$alphanumeric];"     // ***
373            "$alphanumeric > '-';"
374            "$special > '*';", ""),
375 
376            "thx-1138", "---*----");
377 }
378 
379 /**
380  * Create some inverses and confirm that they work.  We have to be
381  * careful how we do this, since the inverses will not be true
382  * inverses -- we can't throw any random string at the composition
383  * of the transliterators and expect the identity function.  F x
384  * F' != I.  However, if we are careful about the input, we will
385  * get the expected results.
386  */
TestRuleBasedInverse()387 void TransliteratorTest::TestRuleBasedInverse() {
388     UnicodeString RULES =
389         UnicodeString("abc>zyx;") +
390         "ab>yz;" +
391         "bc>zx;" +
392         "ca>xy;" +
393         "a>x;" +
394         "b>y;" +
395         "c>z;" +
396 
397         "abc<zyx;" +
398         "ab<yz;" +
399         "bc<zx;" +
400         "ca<xy;" +
401         "a<x;" +
402         "b<y;" +
403         "c<z;" +
404 
405         "";
406 
407     const char* DATA[] = {
408         // Careful here -- random strings will not work.  If we keep
409         // the left side to the domain and the right side to the range
410         // we will be okay though (left, abc; right xyz).
411         "a", "x",
412         "abcacab", "zyxxxyy",
413         "caccb", "xyzzy",
414     };
415 
416     int32_t DATA_length = UPRV_LENGTHOF(DATA);
417 
418     UErrorCode status = U_ZERO_ERROR;
419     UParseError parseError;
420     Transliterator *fwd = Transliterator::createFromRules("<ID>", RULES,
421                                 UTRANS_FORWARD, parseError, status);
422     Transliterator *rev = Transliterator::createFromRules("<ID>", RULES,
423                                 UTRANS_REVERSE, parseError, status);
424     if (U_FAILURE(status)) {
425         errln("FAIL: RBT constructor failed");
426         return;
427     }
428     for (int32_t i=0; i<DATA_length; i+=2) {
429         expect(*fwd, DATA[i], DATA[i+1]);
430         expect(*rev, DATA[i+1], DATA[i]);
431     }
432     delete fwd;
433     delete rev;
434 }
435 
436 /**
437  * Basic test of keyboard.
438  */
TestKeyboard()439 void TransliteratorTest::TestKeyboard() {
440     UParseError parseError;
441     UErrorCode status = U_ZERO_ERROR;
442     Transliterator *t = Transliterator::createFromRules("<ID>",
443                               UnicodeString("psch>Y;")
444                               +"ps>y;"
445                               +"ch>x;"
446                               +"a>A;",
447                               UTRANS_FORWARD, parseError,
448                               status);
449     if (U_FAILURE(status)) {
450         errln("FAIL: RBT constructor failed");
451         return;
452     }
453     const char* DATA[] = {
454         // insertion, buffer
455         "a", "A",
456         "p", "Ap",
457         "s", "Aps",
458         "c", "Apsc",
459         "a", "AycA",
460         "psch", "AycAY",
461         0, "AycAY", // null means finishKeyboardTransliteration
462     };
463 
464     keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
465     delete t;
466 }
467 
468 /**
469  * Basic test of keyboard with cursor.
470  */
TestKeyboard2()471 void TransliteratorTest::TestKeyboard2() {
472     UParseError parseError;
473     UErrorCode status = U_ZERO_ERROR;
474     Transliterator *t = Transliterator::createFromRules("<ID>",
475                               UnicodeString("ych>Y;")
476                               +"ps>|y;"
477                               +"ch>x;"
478                               +"a>A;",
479                               UTRANS_FORWARD, parseError,
480                               status);
481     if (U_FAILURE(status)) {
482         errln("FAIL: RBT constructor failed");
483         return;
484     }
485     const char* DATA[] = {
486         // insertion, buffer
487         "a", "A",
488         "p", "Ap",
489         "s", "Aps", // modified for rollback - "Ay",
490         "c", "Apsc", // modified for rollback - "Ayc",
491         "a", "AycA",
492         "p", "AycAp",
493         "s", "AycAps", // modified for rollback - "AycAy",
494         "c", "AycApsc", // modified for rollback - "AycAyc",
495         "h", "AycAY",
496         0, "AycAY", // null means finishKeyboardTransliteration
497     };
498 
499     keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
500     delete t;
501 }
502 
503 /**
504  * Test keyboard transliteration with back-replacement.
505  */
TestKeyboard3()506 void TransliteratorTest::TestKeyboard3() {
507     // We want th>z but t>y.  Furthermore, during keyboard
508     // transliteration we want t>y then yh>z if t, then h are
509     // typed.
510     UnicodeString RULES("t>|y;"
511                         "yh>z;");
512 
513     const char* DATA[] = {
514         // Column 1: characters to add to buffer (as if typed)
515         // Column 2: expected appearance of buffer after
516         //           keyboard xliteration.
517         "a", "a",
518         "b", "ab",
519         "t", "abt", // modified for rollback - "aby",
520         "c", "abyc",
521         "t", "abyct", // modified for rollback - "abycy",
522         "h", "abycz",
523         0, "abycz", // null means finishKeyboardTransliteration
524     };
525 
526     UParseError parseError;
527     UErrorCode status = U_ZERO_ERROR;
528     Transliterator *t = Transliterator::createFromRules("<ID>", RULES, UTRANS_FORWARD, parseError, status);
529     if (U_FAILURE(status)) {
530         errln("FAIL: RBT constructor failed");
531         return;
532     }
533     keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
534     delete t;
535 }
536 
keyboardAux(const Transliterator & t,const char * DATA[],int32_t DATA_length)537 void TransliteratorTest::keyboardAux(const Transliterator& t,
538                                      const char* DATA[], int32_t DATA_length) {
539     UErrorCode status = U_ZERO_ERROR;
540     UTransPosition index={0, 0, 0, 0};
541     UnicodeString s;
542     for (int32_t i=0; i<DATA_length; i+=2) {
543         UnicodeString log;
544         if (DATA[i] != 0) {
545             log = s + " + "
546                 + DATA[i]
547                 + " -> ";
548             t.transliterate(s, index, DATA[i], status);
549         } else {
550             log = s + " => ";
551             t.finishTransliteration(s, index);
552         }
553         // Show the start index '{' and the cursor '|'
554         UnicodeString a, b, c;
555         s.extractBetween(0, index.contextStart, a);
556         s.extractBetween(index.contextStart, index.start, b);
557         s.extractBetween(index.start, s.length(), c);
558         log.append(a).
559             append((char16_t)LEFT_BRACE).
560             append(b).
561             append((char16_t)PIPE).
562             append(c);
563         if (s == DATA[i+1] && U_SUCCESS(status)) {
564             logln(log);
565         } else {
566             errln(UnicodeString("FAIL: ") + log + ", expected " + DATA[i+1]);
567         }
568     }
569 }
570 
TestArabic()571 void TransliteratorTest::TestArabic() {
572 // Test disabled for 2.0 until new Arabic transliterator can be written.
573 //    /*
574 //    const char* DATA[] = {
575 //        "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
576 //                  "\u0627\u0644\u0644\u063a\u0629\u0020"+
577 //                  "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
578 //                  "\u0628\u0628\u0646\u0638\u0645\u0020"+
579 //                  "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
580 //                  "\u062c\u0645\u064a\u0644\u0629",
581 //    };
582 //    */
583 //
584 //    char16_t ar_raw[] = {
585 //        0x062a, 0x062a, 0x0645, 0x062a, 0x0639, 0x0020, 0x0627,
586 //        0x0644, 0x0644, 0x063a, 0x0629, 0x0020, 0x0627, 0x0644,
587 //        0x0639, 0x0631, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
588 //        0x0628, 0x0628, 0x0646, 0x0638, 0x0645, 0x0020, 0x0643,
589 //        0x062a, 0x0627, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
590 //        0x062c, 0x0645, 0x064a, 0x0644, 0x0629, 0
591 //    };
592 //    UnicodeString ar(ar_raw);
593 //    UErrorCode status=U_ZERO_ERROR;
594 //    UParseError parseError;
595 //    Transliterator *t = Transliterator::createInstance("Latin-Arabic", UTRANS_FORWARD, parseError, status);
596 //    if (t == 0) {
597 //        errln("FAIL: createInstance failed");
598 //        return;
599 //    }
600 //    expect(*t, "Arabic", ar);
601 //    delete t;
602 }
603 
604 /**
605  * Compose the Kana transliterator forward and reverse and try
606  * some strings that should come out unchanged.
607  */
TestCompoundKana()608 void TransliteratorTest::TestCompoundKana() {
609     UParseError parseError;
610     UErrorCode status = U_ZERO_ERROR;
611     Transliterator* t = Transliterator::createInstance("Latin-Hiragana;Hiragana-Latin", UTRANS_FORWARD, parseError, status);
612     if (t == 0) {
613         dataerrln("FAIL: construction of Latin-Hiragana;Hiragana-Latin failed - %s", u_errorName(status));
614     } else {
615         expect(*t, "aaaaa", "aaaaa");
616         delete t;
617     }
618 }
619 
620 /**
621  * Compose the hex transliterators forward and reverse.
622  */
TestCompoundHex()623 void TransliteratorTest::TestCompoundHex() {
624     UParseError parseError;
625     UErrorCode status = U_ZERO_ERROR;
626     Transliterator* a = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
627     Transliterator* b = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, parseError, status);
628     Transliterator* transab[] = { a, b };
629     Transliterator* transba[] = { b, a };
630     if (a == 0 || b == 0) {
631         errln("FAIL: construction failed");
632         delete a;
633         delete b;
634         return;
635     }
636     // Do some basic tests of a
637     expect(*a, "01", UnicodeString("\\u0030\\u0031", ""));
638     // Do some basic tests of b
639     expect(*b, UnicodeString("\\u0030\\u0031", ""), "01");
640 
641     Transliterator* ab = new CompoundTransliterator(transab, 2);
642     UnicodeString s("abcde", "");
643     expect(*ab, s, s);
644 
645     UnicodeString str(s);
646     a->transliterate(str);
647     Transliterator* ba = new CompoundTransliterator(transba, 2);
648     expect(*ba, str, str);
649 
650     delete ab;
651     delete ba;
652     delete a;
653     delete b;
654 }
655 
656 int gTestFilterClassID = 0;
657 /**
658  * Used by TestFiltering().
659  */
660 class TestFilter : public UnicodeFilter {
clone() const661     virtual TestFilter* clone() const override {
662         return new TestFilter(*this);
663     }
contains(UChar32 c) const664     virtual UBool contains(UChar32 c) const override {
665         return c != (char16_t)0x0063 /*c*/;
666     }
667     // Stubs
toPattern(UnicodeString & result,UBool) const668     virtual UnicodeString& toPattern(UnicodeString& result,
669                                      UBool /*escapeUnprintable*/) const override {
670         return result;
671     }
matchesIndexValue(uint8_t) const672     virtual UBool matchesIndexValue(uint8_t /*v*/) const override {
673         return false;
674     }
addMatchSetTo(UnicodeSet &) const675     virtual void addMatchSetTo(UnicodeSet& /*toUnionTo*/) const override {}
676 public:
getDynamicClassID() const677     UClassID getDynamicClassID() const override { return (UClassID)&gTestFilterClassID; }
678 };
679 
680 /**
681  * Do some basic tests of filtering.
682  */
TestFiltering()683 void TransliteratorTest::TestFiltering() {
684     UParseError parseError;
685     UErrorCode status = U_ZERO_ERROR;
686     Transliterator* hex = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
687     if (hex == 0) {
688         errln("FAIL: createInstance(Any-Hex) failed");
689         return;
690     }
691     hex->adoptFilter(new TestFilter());
692     UnicodeString s("abcde");
693     hex->transliterate(s);
694     UnicodeString exp("\\u0061\\u0062c\\u0064\\u0065", "");
695     if (s == exp) {
696         logln(UnicodeString("Ok:   \"") + exp + "\"");
697     } else {
698         logln(UnicodeString("FAIL: \"") + s + "\", wanted \"" + exp + "\"");
699     }
700 
701     // ICU4C ONLY. Do not find Transliterator.orphanFilter() in ICU4J.
702     UnicodeFilter *f = hex->orphanFilter();
703     if (f == nullptr){
704         errln("FAIL: orphanFilter() should get a UnicodeFilter");
705     } else {
706         delete f;
707     }
708     delete hex;
709 }
710 
711 /**
712  * Test anchors
713  */
TestAnchors()714 void TransliteratorTest::TestAnchors() {
715     expect(UnicodeString("^a  > 0; a$ > 2 ; a > 1;", ""),
716            "aaa",
717            "012");
718     expect(UnicodeString("$s=[z$]; $s{a>0; a}$s>2; a>1;", ""),
719            "aaa",
720            "012");
721     expect(UnicodeString("^ab  > 01 ;"
722            " ab  > |8 ;"
723            "  b  > k ;"
724            " 8x$ > 45 ;"
725            " 8x  > 77 ;", ""),
726 
727            "ababbabxabx",
728            "018k7745");
729     expect(UnicodeString("$s = [z$] ;"
730            "$s{ab    > 01 ;"
731            "   ab    > |8 ;"
732            "    b    > k ;"
733            "   8x}$s > 45 ;"
734            "   8x    > 77 ;", ""),
735 
736            "abzababbabxzabxabx",
737            "01z018k45z01x45");
738 }
739 
740 /**
741  * Test pattern quoting and escape mechanisms.
742  */
TestPatternQuoting()743 void TransliteratorTest::TestPatternQuoting() {
744     // Array of 3n items
745     // Each item is <rules>, <input>, <expected output>
746     const UnicodeString DATA[] = {
747         UnicodeString(char16_t(0x4E01)) + ">'[male adult]'",
748         UnicodeString(char16_t(0x4E01)),
749         "[male adult]"
750     };
751 
752     for (int32_t i=0; i<3; i+=3) {
753         logln(UnicodeString("Pattern: ") + prettify(DATA[i]));
754         UParseError parseError;
755         UErrorCode status = U_ZERO_ERROR;
756         Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
757         if (U_FAILURE(status)) {
758             errln("RBT constructor failed");
759         } else {
760             expect(*t, DATA[i+1], DATA[i+2]);
761         }
762         delete t;
763     }
764 }
765 
766 /**
767  * Regression test for bugs found in Greek transliteration.
768  */
TestJ277()769 void TransliteratorTest::TestJ277() {
770     UErrorCode status = U_ZERO_ERROR;
771     UParseError parseError;
772     Transliterator *gl = Transliterator::createInstance("Greek-Latin; NFD; [:M:]Remove; NFC", UTRANS_FORWARD, parseError, status);
773     if (gl == nullptr) {
774         dataerrln("FAIL: createInstance(Greek-Latin) returned nullptr - %s", u_errorName(status));
775         return;
776     }
777 
778     char16_t sigma = 0x3C3;
779     char16_t upsilon = 0x3C5;
780     char16_t nu = 0x3BD;
781 //    char16_t PHI = 0x3A6;
782     char16_t alpha = 0x3B1;
783 //    char16_t omega = 0x3C9;
784 //    char16_t omicron = 0x3BF;
785 //    char16_t epsilon = 0x3B5;
786 
787     // sigma upsilon nu -> syn
788     UnicodeString syn;
789     syn.append(sigma).append(upsilon).append(nu);
790     expect(*gl, syn, "syn");
791 
792     // sigma alpha upsilon nu -> saun
793     UnicodeString sayn;
794     sayn.append(sigma).append(alpha).append(upsilon).append(nu);
795     expect(*gl, sayn, "saun");
796 
797     // Again, using a smaller rule set
798     UnicodeString rules(
799                 "$alpha   = \\u03B1;"
800                 "$nu      = \\u03BD;"
801                 "$sigma   = \\u03C3;"
802                 "$ypsilon = \\u03C5;"
803                 "$vowel   = [aeiouAEIOU$alpha$ypsilon];"
804                 "s <>           $sigma;"
805                 "a <>           $alpha;"
806                 "u <>  $vowel { $ypsilon;"
807                 "y <>           $ypsilon;"
808                 "n <>           $nu;",
809                 "");
810     Transliterator *mini = Transliterator::createFromRules("mini", rules, UTRANS_REVERSE, parseError, status);
811     if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
812     expect(*mini, syn, "syn");
813     expect(*mini, sayn, "saun");
814     delete mini;
815     mini = nullptr;
816 
817 #if !UCONFIG_NO_FORMATTING
818     // Transliterate the Greek locale data
819     Locale el("el");
820     DateFormatSymbols syms(el, status);
821     if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
822     int32_t i, count;
823     const UnicodeString* data = syms.getMonths(count);
824     for (i=0; i<count; ++i) {
825         if (data[i].length() == 0) {
826             continue;
827         }
828         UnicodeString out(data[i]);
829         gl->transliterate(out);
830         UBool ok = true;
831         if (data[i].length() >= 2 && out.length() >= 2 &&
832             u_isupper(data[i].charAt(0)) && u_islower(data[i].charAt(1))) {
833             if (!(u_isupper(out.charAt(0)) && u_islower(out.charAt(1)))) {
834                 ok = false;
835             }
836         }
837         if (ok) {
838             logln(prettify(data[i] + " -> " + out));
839         } else {
840             errln(UnicodeString("FAIL: ") + prettify(data[i] + " -> " + out));
841         }
842     }
843 #endif
844 
845     delete gl;
846 }
847 
848 /**
849  * Prefix, suffix support in hex transliterators
850  */
TestJ243()851 void TransliteratorTest::TestJ243() {
852     UErrorCode ec = U_ZERO_ERROR;
853 
854     // Test default Hex-Any, which should handle
855     // \u, \U, u+, and U+
856     Transliterator *hex =
857         Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, ec);
858     if (assertSuccess("getInstance", ec)) {
859         expect(*hex, UnicodeString("\\u0041+\\U00000042,U+0043uU+0044z", ""), "A+B,CuDz");
860     }
861     delete hex;
862 
863 //    // Try a custom Hex-Unicode
864 //    // \uXXXX and &#xXXXX;
865 //    ec = U_ZERO_ERROR;
866 //    HexToUnicodeTransliterator hex2(UnicodeString("\\\\u###0;&\\#x###0\\;", ""), ec);
867 //    expect(hex2, UnicodeString("\\u61\\u062\\u0063\\u00645\\u66x&#x30;&#x031;&#x0032;&#x00033;", ""),
868 //           "abcd5fx012&#x00033;");
869 //    // Try custom Any-Hex (default is tested elsewhere)
870 //    ec = U_ZERO_ERROR;
871 //    UnicodeToHexTransliterator hex3(UnicodeString("&\\#x###0;", ""), ec);
872 //    expect(hex3, "012", "&#x30;&#x31;&#x32;");
873 }
874 
875 /**
876  * Parsers need better syntax error messages.
877  */
TestJ329()878 void TransliteratorTest::TestJ329() {
879 
880     struct { UBool containsErrors; const char* rule; } DATA[] = {
881         { false, "a > b; c > d" },
882         { true,  "a > b; no operator; c > d" },
883     };
884     int32_t DATA_length = UPRV_LENGTHOF(DATA);
885 
886     for (int32_t i=0; i<DATA_length; ++i) {
887         UErrorCode status = U_ZERO_ERROR;
888         UParseError parseError;
889         Transliterator *rbt = Transliterator::createFromRules("<ID>",
890                                     DATA[i].rule,
891                                     UTRANS_FORWARD,
892                                     parseError,
893                                     status);
894         UBool gotError = U_FAILURE(status);
895         UnicodeString desc(DATA[i].rule);
896         desc.append(gotError ? " -> error" : " -> no error");
897         if (gotError) {
898             desc = desc + ", ParseError code=" + u_errorName(status) +
899                 " line=" + parseError.line +
900                 " offset=" + parseError.offset +
901                 " context=" + parseError.preContext;
902         }
903         if (gotError == DATA[i].containsErrors) {
904             logln(UnicodeString("Ok:   ") + desc);
905         } else {
906             errln(UnicodeString("FAIL: ") + desc);
907         }
908         delete rbt;
909     }
910 }
911 
912 /**
913  * Test segments and segment references.
914  */
TestSegments()915 void TransliteratorTest::TestSegments() {
916     // Array of 3n items
917     // Each item is <rules>, <input>, <expected output>
918     UnicodeString DATA[] = {
919         "([a-z]) '.' ([0-9]) > $2 '-' $1",
920         "abc.123.xyz.456",
921         "ab1-c23.xy4-z56",
922 
923         // nested
924         "(([a-z])([0-9])) > $1 '.' $2 '.' $3;",
925         "a1 b2",
926         "a1.a.1 b2.b.2",
927     };
928     int32_t DATA_length = UPRV_LENGTHOF(DATA);
929 
930     for (int32_t i=0; i<DATA_length; i+=3) {
931         logln("Pattern: " + prettify(DATA[i]));
932         UParseError parseError;
933         UErrorCode status = U_ZERO_ERROR;
934         Transliterator *t = Transliterator::createFromRules("ID", DATA[i], UTRANS_FORWARD, parseError, status);
935         if (U_FAILURE(status)) {
936             errln("FAIL: RBT constructor");
937         } else {
938             expect(*t, DATA[i+1], DATA[i+2]);
939         }
940         delete t;
941     }
942 }
943 
944 /**
945  * Test cursor positioning outside of the key
946  */
TestCursorOffset()947 void TransliteratorTest::TestCursorOffset() {
948     // Array of 3n items
949     // Each item is <rules>, <input>, <expected output>
950     UnicodeString DATA[] = {
951         "pre {alpha} post > | @ ALPHA ;"
952         "eALPHA > beta ;"
953         "pre {beta} post > BETA @@ | ;"
954         "post > xyz",
955 
956         "prealphapost prebetapost",
957 
958         "prbetaxyz preBETApost",
959     };
960     int32_t DATA_length = UPRV_LENGTHOF(DATA);
961 
962     for (int32_t i=0; i<DATA_length; i+=3) {
963         logln("Pattern: " + prettify(DATA[i]));
964         UParseError parseError;
965         UErrorCode status = U_ZERO_ERROR;
966         Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
967         if (U_FAILURE(status)) {
968             errln("FAIL: RBT constructor");
969         } else {
970             expect(*t, DATA[i+1], DATA[i+2]);
971         }
972         delete t;
973     }
974 }
975 
976 /**
977  * Test zero length and > 1 char length variable values.  Test
978  * use of variable refs in UnicodeSets.
979  */
TestArbitraryVariableValues()980 void TransliteratorTest::TestArbitraryVariableValues() {
981     // Array of 3n items
982     // Each item is <rules>, <input>, <expected output>
983     UnicodeString DATA[] = {
984         "$abe = ab;"
985         "$pat = x[yY]z;"
986         "$ll  = 'a-z';"
987         "$llZ = [$ll];"
988         "$llY = [$ll$pat];"
989         "$emp = ;"
990 
991         "$abe > ABE;"
992         "$pat > END;"
993         "$llZ > 1;"
994         "$llY > 2;"
995         "7$emp 8 > 9;"
996         "",
997 
998         "ab xYzxyz stY78",
999         "ABE ENDEND 1129",
1000     };
1001     int32_t DATA_length = UPRV_LENGTHOF(DATA);
1002 
1003     for (int32_t i=0; i<DATA_length; i+=3) {
1004         logln("Pattern: " + prettify(DATA[i]));
1005         UParseError parseError;
1006         UErrorCode status = U_ZERO_ERROR;
1007         Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
1008         if (U_FAILURE(status)) {
1009             errln("FAIL: RBT constructor");
1010         } else {
1011             expect(*t, DATA[i+1], DATA[i+2]);
1012         }
1013         delete t;
1014     }
1015 }
1016 
1017 /**
1018  * Confirm that the contextStart, contextLimit, start, and limit
1019  * behave correctly. J474.
1020  */
TestPositionHandling()1021 void TransliteratorTest::TestPositionHandling() {
1022     // Array of 3n items
1023     // Each item is <rules>, <input>, <expected output>
1024     const char* DATA[] = {
1025         "a{t} > SS ; {t}b > UU ; {t} > TT ;",
1026         "xtat txtb", // pos 0,9,0,9
1027         "xTTaSS TTxUUb",
1028 
1029         "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1030         "xtat txtb", // pos 2,9,3,8
1031         "xtaSS TTxUUb",
1032 
1033         "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1034         "xtat txtb", // pos 3,8,3,8
1035         "xtaTT TTxTTb",
1036     };
1037 
1038     // Array of 4n positions -- these go with the DATA array
1039     // They are: contextStart, contextLimit, start, limit
1040     int32_t POS[] = {
1041         0, 9, 0, 9,
1042         2, 9, 3, 8,
1043         3, 8, 3, 8,
1044     };
1045 
1046     int32_t n = UPRV_LENGTHOF(DATA) / 3;
1047     for (int32_t i=0; i<n; i++) {
1048         UErrorCode status = U_ZERO_ERROR;
1049         UParseError parseError;
1050         Transliterator *t = Transliterator::createFromRules("<ID>",
1051                                 DATA[3*i], UTRANS_FORWARD, parseError, status);
1052         if (U_FAILURE(status)) {
1053             delete t;
1054             errln("FAIL: RBT constructor");
1055             return;
1056         }
1057         UTransPosition pos;
1058         pos.contextStart= POS[4*i];
1059         pos.contextLimit = POS[4*i+1];
1060         pos.start = POS[4*i+2];
1061         pos.limit = POS[4*i+3];
1062         UnicodeString rsource(DATA[3*i+1]);
1063         t->transliterate(rsource, pos, status);
1064         if (U_FAILURE(status)) {
1065             delete t;
1066             errln("FAIL: transliterate");
1067             return;
1068         }
1069         t->finishTransliteration(rsource, pos);
1070         expectAux(DATA[3*i],
1071                   DATA[3*i+1],
1072                   rsource,
1073                   DATA[3*i+2]);
1074         delete t;
1075     }
1076 }
1077 
1078 /**
1079  * Test the Hiragana-Katakana transliterator.
1080  */
TestHiraganaKatakana()1081 void TransliteratorTest::TestHiraganaKatakana() {
1082     UParseError parseError;
1083     UErrorCode status = U_ZERO_ERROR;
1084     Transliterator* hk = Transliterator::createInstance("Hiragana-Katakana", UTRANS_FORWARD, parseError, status);
1085     Transliterator* kh = Transliterator::createInstance("Katakana-Hiragana", UTRANS_FORWARD, parseError, status);
1086     if (hk == 0 || kh == 0) {
1087         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1088         delete hk;
1089         delete kh;
1090         return;
1091     }
1092 
1093     // Array of 3n items
1094     // Each item is "hk"|"kh"|"both", <Hiragana>, <Katakana>
1095     const char* DATA[] = {
1096         "both",
1097         "\\u3042\\u3090\\u3099\\u3092\\u3050",
1098         "\\u30A2\\u30F8\\u30F2\\u30B0",
1099 
1100         "kh",
1101         "\\u307C\\u3051\\u3060\\u3042\\u3093\\u30FC",
1102         "\\u30DC\\u30F6\\u30C0\\u30FC\\u30F3\\u30FC",
1103     };
1104     int32_t DATA_length = UPRV_LENGTHOF(DATA);
1105 
1106     for (int32_t i=0; i<DATA_length; i+=3) {
1107         UnicodeString h = CharsToUnicodeString(DATA[i+1]);
1108         UnicodeString k = CharsToUnicodeString(DATA[i+2]);
1109         switch (*DATA[i]) {
1110         case 0x68: //'h': // Hiragana-Katakana
1111             expect(*hk, h, k);
1112             break;
1113         case 0x6B: //'k': // Katakana-Hiragana
1114             expect(*kh, k, h);
1115             break;
1116         case 0x62: //'b': // both
1117             expect(*hk, h, k);
1118             expect(*kh, k, h);
1119             break;
1120         }
1121     }
1122     delete hk;
1123     delete kh;
1124 }
1125 
1126 /**
1127  * Test cloning / copy constructor of RBT.
1128  */
TestCopyJ476()1129 void TransliteratorTest::TestCopyJ476() {
1130     // The real test here is what happens when the destructors are
1131     // called.  So we let one object get destructed, and check to
1132     // see that its copy still works.
1133     Transliterator *t2 = 0;
1134     {
1135         UParseError parseError;
1136         UErrorCode status = U_ZERO_ERROR;
1137         Transliterator *t1 = Transliterator::createFromRules("t1",
1138             "a>A;b>B;'foo'+>'bar'", UTRANS_FORWARD, parseError, status);
1139         if (U_FAILURE(status)) {
1140             errln("FAIL: RBT constructor");
1141             return;
1142         }
1143         t2 = t1->clone(); // Call copy constructor under the covers.
1144         expect(*t1, "abcfoofoo", "ABcbar");
1145         delete t1;
1146     }
1147     expect(*t2, "abcfoofoo", "ABcbar");
1148     delete t2;
1149 }
1150 
1151 /**
1152  * Test inter-Indic transliterators.  These are composed.
1153  * ICU4C Jitterbug 483.
1154  */
TestInterIndic()1155 void TransliteratorTest::TestInterIndic() {
1156     UnicodeString ID("Devanagari-Gujarati", "");
1157     UErrorCode status = U_ZERO_ERROR;
1158     UParseError parseError;
1159     Transliterator* dg = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1160     if (dg == 0) {
1161         dataerrln("FAIL: createInstance(" + ID + ") returned nullptr - " + u_errorName(status));
1162         return;
1163     }
1164     UnicodeString id = dg->getID();
1165     if (id != ID) {
1166         errln("FAIL: createInstance(" + ID + ")->getID() => " + id);
1167     }
1168     UnicodeString dev = CharsToUnicodeString("\\u0901\\u090B\\u0925");
1169     UnicodeString guj = CharsToUnicodeString("\\u0A81\\u0A8B\\u0AA5");
1170     expect(*dg, dev, guj);
1171     delete dg;
1172 }
1173 
1174 /**
1175  * Test filter syntax in IDs. (J918)
1176  */
TestFilterIDs()1177 void TransliteratorTest::TestFilterIDs() {
1178     // Array of 3n strings:
1179     // <id>, <inverse id>, <input>, <expected output>
1180     const char* DATA[] = {
1181         "[aeiou]Any-Hex", // ID
1182         "[aeiou]Hex-Any", // expected inverse ID
1183         "quizzical",      // src
1184         "q\\u0075\\u0069zz\\u0069c\\u0061l", // expected ID.translit(src)
1185 
1186         "[aeiou]Any-Hex;[^5]Hex-Any",
1187         "[^5]Any-Hex;[aeiou]Hex-Any",
1188         "quizzical",
1189         "q\\u0075izzical",
1190 
1191         "[abc]Null",
1192         "[abc]Null",
1193         "xyz",
1194         "xyz",
1195     };
1196     enum { DATA_length = UPRV_LENGTHOF(DATA) };
1197 
1198     for (int i=0; i<DATA_length; i+=4) {
1199         UnicodeString ID(DATA[i], "");
1200         UnicodeString uID(DATA[i+1], "");
1201         UnicodeString data2(DATA[i+2], "");
1202         UnicodeString data3(DATA[i+3], "");
1203         UParseError parseError;
1204         UErrorCode status = U_ZERO_ERROR;
1205         Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1206         if (t == 0) {
1207             errln("FAIL: createInstance(" + ID + ") returned nullptr");
1208             return;
1209         }
1210         expect(*t, data2, data3);
1211 
1212         // Check the ID
1213         if (ID != t->getID()) {
1214             errln("FAIL: createInstance(" + ID + ").getID() => " +
1215                   t->getID());
1216         }
1217 
1218         // Check the inverse
1219         Transliterator *u = t->createInverse(status);
1220         if (u == 0) {
1221             errln("FAIL: " + ID + ".createInverse() returned nullptr");
1222         } else if (u->getID() != uID) {
1223             errln("FAIL: " + ID + ".createInverse().getID() => " +
1224                   u->getID() + ", expected " + uID);
1225         }
1226 
1227         delete t;
1228         delete u;
1229     }
1230 }
1231 
1232 /**
1233  * Test the case mapping transliterators.
1234  */
TestCaseMap()1235 void TransliteratorTest::TestCaseMap() {
1236     UParseError parseError;
1237     UErrorCode status = U_ZERO_ERROR;
1238     Transliterator* toUpper =
1239         Transliterator::createInstance("Any-Upper[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1240     Transliterator* toLower =
1241         Transliterator::createInstance("Any-Lower[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1242     Transliterator* toTitle =
1243         Transliterator::createInstance("Any-Title[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1244     if (toUpper==0 || toLower==0 || toTitle==0) {
1245         errln("FAIL: createInstance returned nullptr");
1246         delete toUpper;
1247         delete toLower;
1248         delete toTitle;
1249         return;
1250     }
1251 
1252     expect(*toUpper, "The quick brown fox jumped over the lazy dogs.",
1253            "THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
1254     expect(*toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
1255            "the quick brown foX jumped over the lazY dogs.");
1256     expect(*toTitle, "the quick brown foX can't jump over the laZy dogs.",
1257            "The Quick Brown FoX Can't Jump Over The LaZy Dogs.");
1258 
1259     delete toUpper;
1260     delete toLower;
1261     delete toTitle;
1262 }
1263 
1264 /**
1265  * Test the name mapping transliterators.
1266  */
TestNameMap()1267 void TransliteratorTest::TestNameMap() {
1268     UParseError parseError;
1269     UErrorCode status = U_ZERO_ERROR;
1270     Transliterator* uni2name =
1271         Transliterator::createInstance("Any-Name[^abc]", UTRANS_FORWARD, parseError, status);
1272     Transliterator* name2uni =
1273         Transliterator::createInstance("Name-Any", UTRANS_FORWARD, parseError, status);
1274     if (uni2name==0 || name2uni==0) {
1275         errln("FAIL: createInstance returned nullptr");
1276         delete uni2name;
1277         delete name2uni;
1278         return;
1279     }
1280 
1281     // Careful:  CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1282     expect(*uni2name, CharsToUnicodeString("\\u00A0abc\\u4E01\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF"),
1283            CharsToUnicodeString("\\\\N{NO-BREAK SPACE}abc\\\\N{CJK UNIFIED IDEOGRAPH-4E01}\\\\N{MICRO SIGN}\\\\N{GUJARATI SIGN CANDRABINDU}\\\\N{REPLACEMENT CHARACTER}\\\\N{<control-0004>}\\\\N{<control-0009>}\\\\N{<control-0081>}\\\\N{<noncharacter-FFFF>}"));
1284     expect(*name2uni, UNICODE_STRING_SIMPLE("{\\N { NO-BREAK SPACE}abc\\N{  CJK UNIFIED  IDEOGRAPH-4E01  }\\N{x\\N{MICRO SIGN}\\N{GUJARATI SIGN CANDRABINDU}\\N{REPLACEMENT CHARACTER}\\N{<control-0004>}\\N{<control-0009>}\\N{<control-0081>}\\N{<noncharacter-FFFF>}\\N{<control-0004>}\\N{"),
1285            CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{"));
1286 
1287     delete uni2name;
1288     delete name2uni;
1289 
1290     // round trip
1291     Transliterator* t =
1292         Transliterator::createInstance("Any-Name;Name-Any", UTRANS_FORWARD, parseError, status);
1293     if (t==0) {
1294         errln("FAIL: createInstance returned nullptr");
1295         delete t;
1296         return;
1297     }
1298 
1299     // Careful:  CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1300     UnicodeString s = CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{");
1301     expect(*t, s, s);
1302     delete t;
1303 }
1304 
1305 /**
1306  * Test liberalized ID syntax.  1006c
1307  */
TestLiberalizedID()1308 void TransliteratorTest::TestLiberalizedID() {
1309     // Some test cases have an expected getID() value of nullptr.  This
1310     // means I have disabled the test case for now.  This stuff is
1311     // still under development, and I haven't decided whether to make
1312     // getID() return canonical case yet.  It will all get rewritten
1313     // with the move to Source-Target/Variant IDs anyway. [aliu]
1314     const char* DATA[] = {
1315         "latin-greek", nullptr /*"Latin-Greek"*/, "case insensitivity",
1316         "  Null  ", "Null", "whitespace",
1317         " Latin[a-z]-Greek  ", "[a-z]Latin-Greek", "inline filter",
1318         "  null  ; latin-greek  ", nullptr /*"Null;Latin-Greek"*/, "compound whitespace",
1319     };
1320     const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1321     UParseError parseError;
1322     UErrorCode status= U_ZERO_ERROR;
1323     for (int32_t i=0; i<DATA_length; i+=3) {
1324         Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, parseError, status);
1325         if (t == 0) {
1326             dataerrln(UnicodeString("FAIL: ") + DATA[i+2] +
1327                   " cannot create ID \"" + DATA[i] + "\" - " + u_errorName(status));
1328         } else {
1329             UnicodeString exp;
1330             if (DATA[i+1]) {
1331                 exp = UnicodeString(DATA[i+1], "");
1332             }
1333             // Don't worry about getID() if the expected char*
1334             // is nullptr -- see above.
1335             if (exp.length() == 0 || exp == t->getID()) {
1336                 logln(UnicodeString("Ok: ") + DATA[i+2] +
1337                       " create ID \"" + DATA[i] + "\" => \"" +
1338                       exp + "\"");
1339             } else {
1340                 errln(UnicodeString("FAIL: ") + DATA[i+2] +
1341                       " create ID \"" + DATA[i] + "\" => \"" +
1342                       t->getID() + "\", exp \"" + exp + "\"");
1343             }
1344             delete t;
1345         }
1346     }
1347 }
1348 
1349 /* test for Jitterbug 912 */
TestCreateInstance()1350 void TransliteratorTest::TestCreateInstance(){
1351     const char* FORWARD = "F";
1352     const char* REVERSE = "R";
1353     const char* DATA[] = {
1354         // Column 1: id
1355         // Column 2: direction
1356         // Column 3: expected ID, or "" if expect failure
1357         "Latin-Hangul", REVERSE, "Hangul-Latin", // JB#912
1358 
1359         // JB#2689: bad compound causes crash
1360         "InvalidSource-InvalidTarget", FORWARD, "",
1361         "InvalidSource-InvalidTarget", REVERSE, "",
1362         "Hex-Any;InvalidSource-InvalidTarget", FORWARD, "",
1363         "Hex-Any;InvalidSource-InvalidTarget", REVERSE, "",
1364         "InvalidSource-InvalidTarget;Hex-Any", FORWARD, "",
1365         "InvalidSource-InvalidTarget;Hex-Any", REVERSE, "",
1366 
1367         nullptr
1368     };
1369 
1370     for (int32_t i=0; DATA[i]; i+=3) {
1371         UParseError err;
1372         UErrorCode ec = U_ZERO_ERROR;
1373         UnicodeString id(DATA[i]);
1374         UTransDirection dir = (DATA[i+1]==FORWARD)?
1375             UTRANS_FORWARD:UTRANS_REVERSE;
1376         UnicodeString expID(DATA[i+2]);
1377         Transliterator* t =
1378             Transliterator::createInstance(id,dir,err,ec);
1379         UnicodeString newID;
1380         if (t) {
1381             newID = t->getID();
1382         }
1383         UBool ok = (newID == expID);
1384         if (!t) {
1385             newID = u_errorName(ec);
1386         }
1387         if (ok) {
1388             logln((UnicodeString)"Ok: createInstance(" +
1389                   id + "," + DATA[i+1] + ") => " + newID);
1390         } else {
1391             dataerrln((UnicodeString)"FAIL: createInstance(" +
1392                   id + "," + DATA[i+1] + ") => " + newID +
1393                   ", expected " + expID);
1394         }
1395         delete t;
1396     }
1397 }
1398 
1399 /**
1400  * Test the normalization transliterator.
1401  */
TestNormalizationTransliterator()1402 void TransliteratorTest::TestNormalizationTransliterator() {
1403     // THE FOLLOWING TWO TABLES ARE COPIED FROM com.ibm.test.normalizer.BasicTest
1404     // PLEASE KEEP THEM IN SYNC WITH BasicTest.
1405     const char* CANON[] = {
1406         // Input               Decomposed            Composed
1407         "cat",                "cat",                "cat"               ,
1408         "\\u00e0ardvark",      "a\\u0300ardvark",     "\\u00e0ardvark"    ,
1409 
1410         "\\u1e0a",             "D\\u0307",            "\\u1e0a"            , // D-dot_above
1411         "D\\u0307",            "D\\u0307",            "\\u1e0a"            , // D dot_above
1412 
1413         "\\u1e0c\\u0307",       "D\\u0323\\u0307",      "\\u1e0c\\u0307"      , // D-dot_below dot_above
1414         "\\u1e0a\\u0323",       "D\\u0323\\u0307",      "\\u1e0c\\u0307"      , // D-dot_above dot_below
1415         "D\\u0307\\u0323",      "D\\u0323\\u0307",      "\\u1e0c\\u0307"      , // D dot_below dot_above
1416 
1417         "\\u1e10\\u0307\\u0323", "D\\u0327\\u0323\\u0307","\\u1e10\\u0323\\u0307", // D dot_below cedilla dot_above
1418         "D\\u0307\\u0328\\u0323","D\\u0328\\u0323\\u0307","\\u1e0c\\u0328\\u0307", // D dot_above ogonek dot_below
1419 
1420         "\\u1E14",             "E\\u0304\\u0300",      "\\u1E14"            , // E-macron-grave
1421         "\\u0112\\u0300",       "E\\u0304\\u0300",      "\\u1E14"            , // E-macron + grave
1422         "\\u00c8\\u0304",       "E\\u0300\\u0304",      "\\u00c8\\u0304"      , // E-grave + macron
1423 
1424         "\\u212b",             "A\\u030a",            "\\u00c5"            , // angstrom_sign
1425         "\\u00c5",             "A\\u030a",            "\\u00c5"            , // A-ring
1426 
1427         "\\u00fdffin",         "y\\u0301ffin",        "\\u00fdffin"        ,    //updated with 3.0
1428         "\\u00fd\\uFB03n",      "y\\u0301\\uFB03n",     "\\u00fd\\uFB03n"     , //updated with 3.0
1429 
1430         "Henry IV",           "Henry IV",           "Henry IV"          ,
1431         "Henry \\u2163",       "Henry \\u2163",       "Henry \\u2163"      ,
1432 
1433         "\\u30AC",             "\\u30AB\\u3099",       "\\u30AC"            , // ga (Katakana)
1434         "\\u30AB\\u3099",       "\\u30AB\\u3099",       "\\u30AC"            , // ka + ten
1435         "\\uFF76\\uFF9E",       "\\uFF76\\uFF9E",       "\\uFF76\\uFF9E"      , // hw_ka + hw_ten
1436         "\\u30AB\\uFF9E",       "\\u30AB\\uFF9E",       "\\u30AB\\uFF9E"      , // ka + hw_ten
1437         "\\uFF76\\u3099",       "\\uFF76\\u3099",       "\\uFF76\\u3099"      , // hw_ka + ten
1438 
1439         "A\\u0300\\u0316",      "A\\u0316\\u0300",      "\\u00C0\\u0316"      ,
1440         0 // end
1441     };
1442 
1443     const char* COMPAT[] = {
1444         // Input               Decomposed            Composed
1445         "\\uFB4f",             "\\u05D0\\u05DC",       "\\u05D0\\u05DC"     , // Alef-Lamed vs. Alef, Lamed
1446 
1447         "\\u00fdffin",         "y\\u0301ffin",        "\\u00fdffin"        ,    //updated for 3.0
1448         "\\u00fd\\uFB03n",      "y\\u0301ffin",        "\\u00fdffin"        , // ffi ligature -> f + f + i
1449 
1450         "Henry IV",           "Henry IV",           "Henry IV"          ,
1451         "Henry \\u2163",       "Henry IV",           "Henry IV"          ,
1452 
1453         "\\u30AC",             "\\u30AB\\u3099",       "\\u30AC"            , // ga (Katakana)
1454         "\\u30AB\\u3099",       "\\u30AB\\u3099",       "\\u30AC"            , // ka + ten
1455 
1456         "\\uFF76\\u3099",       "\\u30AB\\u3099",       "\\u30AC"            , // hw_ka + ten
1457         0 // end
1458     };
1459 
1460     int32_t i;
1461     UParseError parseError;
1462     UErrorCode status = U_ZERO_ERROR;
1463     Transliterator* NFD = Transliterator::createInstance("NFD", UTRANS_FORWARD, parseError, status);
1464     Transliterator* NFC = Transliterator::createInstance("NFC", UTRANS_FORWARD, parseError, status);
1465     if (!NFD || !NFC) {
1466         dataerrln("FAIL: createInstance failed: %s", u_errorName(status));
1467         delete NFD;
1468         delete NFC;
1469         return;
1470     }
1471     for (i=0; CANON[i]; i+=3) {
1472         UnicodeString in = CharsToUnicodeString(CANON[i]);
1473         UnicodeString expd = CharsToUnicodeString(CANON[i+1]);
1474         UnicodeString expc = CharsToUnicodeString(CANON[i+2]);
1475         expect(*NFD, in, expd);
1476         expect(*NFC, in, expc);
1477     }
1478     delete NFD;
1479     delete NFC;
1480 
1481     Transliterator* NFKD = Transliterator::createInstance("NFKD", UTRANS_FORWARD, parseError, status);
1482     Transliterator* NFKC = Transliterator::createInstance("NFKC", UTRANS_FORWARD, parseError, status);
1483     if (!NFKD || !NFKC) {
1484         dataerrln("FAIL: createInstance failed");
1485         delete NFKD;
1486         delete NFKC;
1487         return;
1488     }
1489     for (i=0; COMPAT[i]; i+=3) {
1490         UnicodeString in = CharsToUnicodeString(COMPAT[i]);
1491         UnicodeString expkd = CharsToUnicodeString(COMPAT[i+1]);
1492         UnicodeString expkc = CharsToUnicodeString(COMPAT[i+2]);
1493         expect(*NFKD, in, expkd);
1494         expect(*NFKC, in, expkc);
1495     }
1496     delete NFKD;
1497     delete NFKC;
1498 
1499     UParseError pe;
1500     status = U_ZERO_ERROR;
1501     Transliterator *t = Transliterator::createInstance("NFD; [x]Remove",
1502                                                        UTRANS_FORWARD,
1503                                                        pe, status);
1504     if (t == 0) {
1505         errln("FAIL: createInstance failed");
1506     }
1507     expect(*t, CharsToUnicodeString("\\u010dx"),
1508            CharsToUnicodeString("c\\u030C"));
1509     delete t;
1510 }
1511 
1512 /**
1513  * Test we can create basic transliterator even without data.
1514  */
TestBasicTransliteratorEvenWithoutData()1515 void TransliteratorTest::TestBasicTransliteratorEvenWithoutData() {
1516     const char16_t* TEST_DATA = u"\u0124e\u0301 \uFB01nd x";
1517     const char16_t* EXPECTED_RESULTS[] = {
1518         u"H\u0302e\u0301 \uFB01nd x",  // NFD
1519         u"\u0124\u00E9 \uFB01nd x",  // NFC
1520         u"H\u0302e\u0301 find x",  // NFKD
1521         u"\u0124\u00E9 find x",  // NFKC
1522         u"\u0124e\u0301 \uFB01nd x",  // Hex-Any
1523         u"\u0125e\u0301 \uFB01nd x",  // Lower
1524         u"\u0124e\uFB01ndx",  // [:^L:]Remove
1525         u"H\u0302e\u0301 \uFB01nd ",  // NFD; [x]Remove
1526         u"h\u0302e\u0301 find x",  // Lower; NFKD;
1527         u"hefindx",  // Lower; NFKD; [:^L:]Remove; NFC;
1528         u"\u0124e \uFB01nd x",  // [:Nonspacing Mark:] Remove;
1529         u"He \uFB01nd x",  // NFD; [:Nonspacing Mark:] Remove; NFC;
1530         // end
1531         0
1532     };
1533 
1534     const char* BASIC_TRANSLITERATOR_ID[] = {
1535         "NFD",
1536         "NFC",
1537         "NFKD",
1538         "NFKC",
1539         "Hex-Any",
1540         "Lower",
1541         "[:^L:]Remove",
1542         "NFD; [x]Remove",
1543         "Lower; NFKD;",
1544         "Lower; NFKD; [:^L:]Remove; NFC;",
1545         "[:Nonspacing Mark:] Remove;",
1546         "NFD; [:Nonspacing Mark:] Remove; NFC;",
1547         // end
1548         0
1549     };
1550     const char* BASIC_TRANSLITERATOR_RULES[] = {
1551         "::Lower; ::NFKD;",
1552         "::Lower; ::NFKD; ::[:^L:]Remove; ::NFC;",
1553         "::[:Nonspacing Mark:] Remove;",
1554         "::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;",
1555         // end
1556         0
1557     };
1558     for (int32_t i=0; BASIC_TRANSLITERATOR_ID[i]; i++) {
1559         UErrorCode status = U_ZERO_ERROR;
1560         UParseError parseError;
1561         std::unique_ptr<Transliterator> translit(Transliterator::createInstance(
1562             BASIC_TRANSLITERATOR_ID[i], UTRANS_FORWARD, parseError, status));
1563         if (translit.get() == nullptr || !U_SUCCESS(status)) {
1564             dataerrln("FAIL: createInstance %s failed", BASIC_TRANSLITERATOR_ID[i]);
1565             continue;
1566         }
1567         UnicodeString data(TEST_DATA);
1568         UnicodeString expected(EXPECTED_RESULTS[i]);
1569         translit->transliterate(data);
1570         if (data != expected) {
1571             dataerrln(UnicodeString("FAIL: expected translit(") +
1572                       BASIC_TRANSLITERATOR_ID[i] + ") = '" +
1573                       EXPECTED_RESULTS[i] + "' but got '" + data);
1574             continue;
1575         }
1576     }
1577     for (int32_t i=0; BASIC_TRANSLITERATOR_RULES[i]; i++) {
1578         UErrorCode status = U_ZERO_ERROR;
1579         UParseError parseError;
1580         std::unique_ptr<Transliterator> translit(Transliterator::createFromRules(
1581             "Test",
1582             BASIC_TRANSLITERATOR_RULES[i], UTRANS_FORWARD, parseError, status));
1583         if (translit.get() == nullptr || !U_SUCCESS(status)) {
1584             dataerrln("FAIL: createFromRules %s failed", BASIC_TRANSLITERATOR_RULES[i]);
1585             continue;
1586         }
1587     }
1588 }
1589 
1590 /**
1591  * Test compound RBT rules.
1592  */
TestCompoundRBT()1593 void TransliteratorTest::TestCompoundRBT() {
1594     // Careful with spacing and ';' here:  Phrase this exactly
1595     // as toRules() is going to return it.  If toRules() changes
1596     // with regard to spacing or ';', then adjust this string.
1597     UnicodeString rule("::Hex-Any;\n"
1598                        "::Any-Lower;\n"
1599                        "a > '.A.';\n"
1600                        "b > '.B.';\n"
1601                        "::[^t]Any-Upper;", "");
1602     UParseError parseError;
1603     UErrorCode status = U_ZERO_ERROR;
1604     Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, parseError, status);
1605     if (t == 0) {
1606         errln("FAIL: createFromRules failed");
1607         return;
1608     }
1609     expect(*t, UNICODE_STRING_SIMPLE("\\u0043at in the hat, bat on the mat"),
1610            "C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
1611     UnicodeString r;
1612     t->toRules(r, true);
1613     if (r == rule) {
1614         logln((UnicodeString)"OK: toRules() => " + r);
1615     } else {
1616         errln((UnicodeString)"FAIL: toRules() => " + r +
1617               ", expected " + rule);
1618     }
1619     delete t;
1620 
1621     // Now test toRules
1622     t = Transliterator::createInstance("Greek-Latin; Latin-Cyrillic", UTRANS_FORWARD, parseError, status);
1623     if (t == 0) {
1624         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1625         return;
1626     }
1627     UnicodeString exp("::Greek-Latin;\n::Latin-Cyrillic;");
1628     t->toRules(r, true);
1629     if (r != exp) {
1630         errln((UnicodeString)"FAIL: toRules() => " + r +
1631               ", expected " + exp);
1632     } else {
1633         logln((UnicodeString)"OK: toRules() => " + r);
1634     }
1635     delete t;
1636 
1637     // Round trip the result of toRules
1638     t = Transliterator::createFromRules("Test", r, UTRANS_FORWARD, parseError, status);
1639     if (t == 0) {
1640         errln("FAIL: createFromRules #2 failed");
1641         return;
1642     } else {
1643         logln((UnicodeString)"OK: createFromRules(" + r + ") succeeded");
1644     }
1645 
1646     // Test toRules again
1647     t->toRules(r, true);
1648     if (r != exp) {
1649         errln((UnicodeString)"FAIL: toRules() => " + r +
1650               ", expected " + exp);
1651     } else {
1652         logln((UnicodeString)"OK: toRules() => " + r);
1653     }
1654 
1655     delete t;
1656 
1657     // Test Foo(Bar) IDs.  Careful with spacing in id; make it conform
1658     // to what the regenerated ID will look like.
1659     UnicodeString id("Upper(Lower);(NFKC)", "");
1660     t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
1661     if (t == 0) {
1662         errln("FAIL: createInstance #2 failed");
1663         return;
1664     }
1665     if (t->getID() == id) {
1666         logln((UnicodeString)"OK: created " + id);
1667     } else {
1668         errln((UnicodeString)"FAIL: createInstance(" + id +
1669               ").getID() => " + t->getID());
1670     }
1671 
1672     Transliterator *u = t->createInverse(status);
1673     if (u == 0) {
1674         errln("FAIL: createInverse failed");
1675         delete t;
1676         return;
1677     }
1678     exp = "NFKC();Lower(Upper)";
1679     if (u->getID() == exp) {
1680         logln((UnicodeString)"OK: createInverse(" + id + ") => " +
1681               u->getID());
1682     } else {
1683         errln((UnicodeString)"FAIL: createInverse(" + id + ") => " +
1684               u->getID());
1685     }
1686     delete t;
1687     delete u;
1688 }
1689 
1690 /**
1691  * Compound filter semantics were originally not implemented
1692  * correctly.  Originally, each component filter f(i) is replaced by
1693  * f'(i) = f(i) && g, where g is the filter for the compound
1694  * transliterator.
1695  *
1696  * From Mark:
1697  *
1698  * Suppose and I have a transliterator X. Internally X is
1699  * "Greek-Latin; Latin-Cyrillic; Any-Lower". I use a filter [^A].
1700  *
1701  * The compound should convert all greek characters (through latin) to
1702  * cyrillic, then lowercase the result. The filter should say "don't
1703  * touch 'A' in the original". But because an intermediate result
1704  * happens to go through "A", the Greek Alpha gets hung up.
1705  */
TestCompoundFilter()1706 void TransliteratorTest::TestCompoundFilter() {
1707     UParseError parseError;
1708     UErrorCode status = U_ZERO_ERROR;
1709     Transliterator *t = Transliterator::createInstance
1710         ("Greek-Latin; Latin-Greek; Lower", UTRANS_FORWARD, parseError, status);
1711     if (t == 0) {
1712         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1713         return;
1714     }
1715     t->adoptFilter(new UnicodeSet("[^A]", status));
1716     if (U_FAILURE(status)) {
1717         errln("FAIL: UnicodeSet ct failed");
1718         delete t;
1719         return;
1720     }
1721 
1722     // Only the 'A' at index 1 should remain unchanged
1723     expect(*t,
1724            CharsToUnicodeString("BA\\u039A\\u0391"),
1725            CharsToUnicodeString("\\u03b2A\\u03ba\\u03b1"));
1726     delete t;
1727 }
1728 
TestRemove()1729 void TransliteratorTest::TestRemove() {
1730     UParseError parseError;
1731     UErrorCode status = U_ZERO_ERROR;
1732     Transliterator *t = Transliterator::createInstance("Remove[abc]", UTRANS_FORWARD, parseError, status);
1733     if (t == 0) {
1734         errln("FAIL: createInstance failed");
1735         return;
1736     }
1737 
1738     expect(*t, "Able bodied baker's cats", "Ale odied ker's ts");
1739 
1740     // extra test for RemoveTransliterator::clone(), which at one point wasn't
1741     // duplicating the filter
1742     Transliterator* t2 = t->clone();
1743     expect(*t2, "Able bodied baker's cats", "Ale odied ker's ts");
1744 
1745     delete t;
1746     delete t2;
1747 }
1748 
TestToRules()1749 void TransliteratorTest::TestToRules() {
1750     const char* RBT = "rbt";
1751     const char* SET = "set";
1752     static const char* DATA[] = {
1753         RBT,
1754         "$a=\\u4E61; [$a] > A;",
1755         "[\\u4E61] > A;",
1756 
1757         RBT,
1758         "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
1759         "[[:Zs:][:Zl:]]{a} > A;",
1760 
1761         SET,
1762         "[[:Zs:][:Zl:]]",
1763         "[[:Zs:][:Zl:]]",
1764 
1765         SET,
1766         "[:Ps:]",
1767         "[:Ps:]",
1768 
1769         SET,
1770         "[:L:]",
1771         "[:L:]",
1772 
1773         SET,
1774         "[[:L:]-[A]]",
1775         "[[:L:]-[A]]",
1776 
1777         SET,
1778         "[~[:Lu:][:Ll:]]",
1779         "[~[:Lu:][:Ll:]]",
1780 
1781         SET,
1782         "[~[a-z]]",
1783         "[~[a-z]]",
1784 
1785         RBT,
1786         "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
1787         "[^[:Zs:]]{a} > A;",
1788 
1789         RBT,
1790         "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
1791         "[[a-z]-[:Zs:]]{a} > A;",
1792 
1793         RBT,
1794         "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
1795         "[[:Zs:]&[a-z]]{a} > A;",
1796 
1797         RBT,
1798         "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
1799         "[x[:Zs:]]{a} > A;",
1800 
1801         RBT,
1802         "$accentMinus = [ [\\u0300-\\u0345] & [:M:] - [\\u0338]] ;"
1803         "$macron = \\u0304 ;"
1804         "$evowel = [aeiouyAEIOUY] ;"
1805         "$iotasub = \\u0345 ;"
1806         "($evowel $macron $accentMinus *) i > | $1 $iotasub ;",
1807         "([AEIOUYaeiouy]\\u0304[[\\u0300-\\u0345]&[:M:]-[\\u0338]]*)i > | $1 \\u0345;",
1808 
1809         RBT,
1810         "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1811         "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1812     };
1813     static const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1814 
1815     for (int32_t d=0; d < DATA_length; d+=3) {
1816         if (DATA[d] == RBT) {
1817             // Transliterator test
1818             UParseError parseError;
1819             UErrorCode status = U_ZERO_ERROR;
1820             Transliterator *t = Transliterator::createFromRules("ID",
1821                                                                 UnicodeString(DATA[d+1], -1, US_INV), UTRANS_FORWARD, parseError, status);
1822             if (t == 0) {
1823                 dataerrln("FAIL: createFromRules failed - %s", u_errorName(status));
1824                 return;
1825             }
1826             UnicodeString rules, escapedRules;
1827             t->toRules(rules, false);
1828             t->toRules(escapedRules, true);
1829             UnicodeString expRules = CharsToUnicodeString(DATA[d+2]);
1830             UnicodeString expEscapedRules(DATA[d+2], -1, US_INV);
1831             if (rules == expRules) {
1832                 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1833                       " => " + rules);
1834             } else {
1835                 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1836                       " => " + rules + ", exp " + expRules);
1837             }
1838             if (escapedRules == expEscapedRules) {
1839                 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1840                       " => " + escapedRules);
1841             } else {
1842                 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1843                       " => " + escapedRules + ", exp " + expEscapedRules);
1844             }
1845             delete t;
1846 
1847         } else {
1848             // UnicodeSet test
1849             UErrorCode status = U_ZERO_ERROR;
1850             UnicodeString pat(DATA[d+1], -1, US_INV);
1851             UnicodeString expToPat(DATA[d+2], -1, US_INV);
1852             UnicodeSet set(pat, status);
1853             if (U_FAILURE(status)) {
1854                 errln("FAIL: UnicodeSet ct failed");
1855                 return;
1856             }
1857             // Adjust spacing etc. as necessary.
1858             UnicodeString toPat;
1859             set.toPattern(toPat);
1860             if (expToPat == toPat) {
1861                 logln((UnicodeString)"Ok: " + pat +
1862                       " => " + toPat);
1863             } else {
1864                 errln((UnicodeString)"FAIL: " + pat +
1865                       " => " + prettify(toPat, true) +
1866                       ", exp " + prettify(pat, true));
1867             }
1868         }
1869     }
1870 }
1871 
TestContext()1872 void TransliteratorTest::TestContext() {
1873     UTransPosition pos = {0, 2, 0, 1}; // cs cl s l
1874     expect("de > x; {d}e > y;",
1875            "de",
1876            "ye",
1877            &pos);
1878 
1879     expect("ab{c} > z;",
1880            "xadabdabcy",
1881            "xadabdabzy");
1882 }
1883 
TestSupplemental()1884 void TransliteratorTest::TestSupplemental() {
1885 
1886     expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];"
1887                                 "a > $a; $s > i;"),
1888            CharsToUnicodeString("ab\\U0001030Fx"),
1889            CharsToUnicodeString("\\U00010300bix"));
1890 
1891     expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];"
1892                                 "$b=[A-Z\\U00010400-\\U0001044D];"
1893                                 "($a)($b) > $2 $1;"),
1894            CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
1895            CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
1896 
1897     // k|ax\\U00010300xm
1898 
1899     // k|a\\U00010400\\U00010300xm
1900     // ky|\\U00010400\\U00010300xm
1901     // ky\\U00010400|\\U00010300xm
1902 
1903     // ky\\U00010400|\\U00010300\\U00010400m
1904     // ky\\U00010400y|\\U00010400m
1905     expect(CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];"
1906                                 "$a {x} > | @ \\U00010400;"
1907                                 "{$a} [^\\u0000-\\uFFFF] > y;"),
1908            CharsToUnicodeString("kax\\U00010300xm"),
1909            CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
1910 
1911     expectT("Any-Name",
1912            CharsToUnicodeString("\\U00010330\\U000E0061\\u00A0"),
1913            UNICODE_STRING_SIMPLE("\\N{GOTHIC LETTER AHSA}\\N{TAG LATIN SMALL LETTER A}\\N{NO-BREAK SPACE}"));
1914 
1915     expectT("Any-Hex/Unicode",
1916            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1917            UNICODE_STRING_SIMPLE("U+10330U+10FF00U+E0061U+00A0"));
1918 
1919     expectT("Any-Hex/C",
1920            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1921            UNICODE_STRING_SIMPLE("\\U00010330\\U0010FF00\\U000E0061\\u00A0"));
1922 
1923     expectT("Any-Hex/Perl",
1924            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1925            UNICODE_STRING_SIMPLE("\\x{10330}\\x{10FF00}\\x{E0061}\\x{A0}"));
1926 
1927     expectT("Any-Hex/Java",
1928            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1929            UNICODE_STRING_SIMPLE("\\uD800\\uDF30\\uDBFF\\uDF00\\uDB40\\uDC61\\u00A0"));
1930 
1931     expectT("Any-Hex/XML",
1932            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1933            "&#x10330;&#x10FF00;&#xE0061;&#xA0;");
1934 
1935     expectT("Any-Hex/XML10",
1936            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1937            "&#66352;&#1113856;&#917601;&#160;");
1938 
1939     expectT(UNICODE_STRING_SIMPLE("[\\U000E0000-\\U000E0FFF] Remove"),
1940            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1941            CharsToUnicodeString("\\U00010330\\U0010FF00\\u00A0"));
1942 }
1943 
TestQuantifier()1944 void TransliteratorTest::TestQuantifier() {
1945 
1946     // Make sure @ in a quantified anteContext works
1947     expect("a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
1948            "AAAAAb",
1949            "aaa(aac)");
1950 
1951     // Make sure @ in a quantified postContext works
1952     expect("{b} a+ > c @@ |; (a+) > '(' $1 ')';",
1953            "baaaaa",
1954            "caa(aaa)");
1955 
1956     // Make sure @ in a quantified postContext with seg ref works
1957     expect("{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
1958            "baaaaa",
1959            "baa(aaa)");
1960 
1961     // Make sure @ past ante context doesn't enter ante context
1962     UTransPosition pos = {0, 5, 3, 5};
1963     expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
1964            "xxxab",
1965            "xxx(ac)",
1966            &pos);
1967 
1968     // Make sure @ past post context doesn't pass limit
1969     UTransPosition pos2 = {0, 4, 0, 2};
1970     expect("{b} a+ > c @@ |; x > y; a > A;",
1971            "baxx",
1972            "caxx",
1973            &pos2);
1974 
1975     // Make sure @ past post context doesn't enter post context
1976     expect("{b} a+ > c @@ |; x > y; a > A;",
1977            "baxx",
1978            "cayy");
1979 
1980     expect("(ab)? c > d;",
1981            "c abc ababc",
1982            "d d abd");
1983 
1984     // NOTE: The (ab)+ when referenced just yields a single "ab",
1985     // not the full sequence of them.  This accords with perl behavior.
1986     expect("(ab)+ {x} > '(' $1 ')';",
1987            "x abx ababxy",
1988            "x ab(ab) abab(ab)y");
1989 
1990     expect("b+ > x;",
1991            "ac abc abbc abbbc",
1992            "ac axc axc axc");
1993 
1994     expect("[abc]+ > x;",
1995            "qac abrc abbcs abtbbc",
1996            "qx xrx xs xtx");
1997 
1998     expect("q{(ab)+} > x;",
1999            "qa qab qaba qababc qaba",
2000            "qa qx qxa qxc qxa");
2001 
2002     expect("q(ab)* > x;",
2003            "qa qab qaba qababc",
2004            "xa x xa xc");
2005 
2006     // NOTE: The (ab)+ when referenced just yields a single "ab",
2007     // not the full sequence of them.  This accords with perl behavior.
2008     expect("q(ab)* > '(' $1 ')';",
2009            "qa qab qaba qababc",
2010            "()a (ab) (ab)a (ab)c");
2011 
2012     // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
2013     // quoted string
2014     expect("'ab'+ > x;",
2015            "bb ab ababb",
2016            "bb x xb");
2017 
2018     // $foo+ and $foo* -- the quantifier should apply to the entire
2019     // variable reference
2020     expect("$var = ab; $var+ > x;",
2021            "bb ab ababb",
2022            "bb x xb");
2023 }
2024 
2025 class TestTrans : public Transliterator {
2026 public:
TestTrans(const UnicodeString & id)2027     TestTrans(const UnicodeString& id) : Transliterator(id, 0) {
2028     }
clone() const2029     virtual TestTrans* clone() const override {
2030         return new TestTrans(getID());
2031     }
handleTransliterate(Replaceable &,UTransPosition & offsets,UBool) const2032     virtual void handleTransliterate(Replaceable& /*text*/, UTransPosition& offsets,
2033         UBool /*isIncremental*/) const override
2034     {
2035         offsets.start = offsets.limit;
2036     }
2037     virtual UClassID getDynamicClassID() const override;
2038     static UClassID U_EXPORT2 getStaticClassID();
2039 };
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)2040 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)
2041 
2042 /**
2043  * Test Source-Target/Variant.
2044  */
2045 void TransliteratorTest::TestSTV() {
2046     int32_t ns = Transliterator::countAvailableSources();
2047     logln((UnicodeString)"countAvailableSources at start: " + ns);
2048     if (ns < 0 || ns > 255) {
2049         errln((UnicodeString)"FAIL: Bad source count: " + ns);
2050         return;
2051     }
2052     int32_t i, j;
2053     for (i=0; i<ns; ++i) {
2054         UnicodeString source;
2055         Transliterator::getAvailableSource(i, source);
2056         logln((UnicodeString)"" + i + ": " + source);
2057         if (source.length() == 0) {
2058             errln("FAIL: empty source");
2059             continue;
2060         }
2061         int32_t nt = Transliterator::countAvailableTargets(source);
2062         if (nt < 0 || nt > 255) {
2063             errln((UnicodeString)"FAIL: Bad target count: " + nt);
2064             continue;
2065         }
2066         for (int32_t j=0; j<nt; ++j) {
2067             UnicodeString target;
2068             Transliterator::getAvailableTarget(j, source, target);
2069             logln((UnicodeString)" " + j + ": " + target);
2070             if (target.length() == 0) {
2071                 errln("FAIL: empty target");
2072                 continue;
2073             }
2074             int32_t nv = Transliterator::countAvailableVariants(source, target);
2075             if (nv < 0 || nv > 255) {
2076                 errln((UnicodeString)"FAIL: Bad variant count: " + nv);
2077                 continue;
2078             }
2079             for (int32_t k=0; k<nv; ++k) {
2080                 UnicodeString variant;
2081                 Transliterator::getAvailableVariant(k, source, target, variant);
2082                 if (variant.length() == 0) {
2083                     logln((UnicodeString)"  " + k + ": <empty>");
2084                 } else {
2085                     logln((UnicodeString)"  " + k + ": " + variant);
2086                 }
2087             }
2088         }
2089     }
2090 
2091     // Test registration
2092     const char* IDS[] = { "Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2093     const char* FULL_IDS[] = { "Any-Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2094     const char* SOURCES[] = { nullptr, "Seoridf", "Oewoir" };
2095     for (i=0; i<3; ++i) {
2096         Transliterator *t = new TestTrans(IDS[i]);
2097         if (t == 0) {
2098             errln("FAIL: out of memory");
2099             return;
2100         }
2101         if (t->getID() != IDS[i]) {
2102             errln((UnicodeString)"FAIL: ID mismatch for " + IDS[i]);
2103             delete t;
2104             return;
2105         }
2106         Transliterator::registerInstance(t);
2107         UErrorCode status = U_ZERO_ERROR;
2108         t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2109         if (t == nullptr) {
2110             errln((UnicodeString)"FAIL: Registration/creation failed for ID " +
2111                   IDS[i]);
2112         } else {
2113             logln((UnicodeString)"Ok: Registration/creation succeeded for ID " +
2114                   IDS[i]);
2115             delete t;
2116         }
2117         Transliterator::unregister(IDS[i]);
2118         t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2119         if (t != nullptr) {
2120             errln((UnicodeString)"FAIL: Unregistration failed for ID " +
2121                   IDS[i]);
2122             delete t;
2123         }
2124     }
2125 
2126     // Make sure getAvailable API reflects removal
2127     int32_t n = Transliterator::countAvailableIDs();
2128     logln((UnicodeString)"countAvailableIDs at end: " + n);
2129     for (i=0; i<n; ++i) {
2130         UnicodeString id = Transliterator::getAvailableID(i);
2131         for (j=0; j<3; ++j) {
2132             if (id.caseCompare(FULL_IDS[j],0)==0) {
2133                 errln((UnicodeString)"FAIL: unregister(" + id + ") failed");
2134             }
2135         }
2136     }
2137     n = Transliterator::countAvailableTargets("Any");
2138     logln((UnicodeString)"countAvailableTargets(\"Any\") at end: " + n);
2139     for (i=0; i<n; ++i) {
2140         UnicodeString t;
2141         Transliterator::getAvailableTarget(i, "Any", t);
2142         if (t.caseCompare(IDS[0],0)==0) {
2143             errln((UnicodeString)"FAIL: unregister(Any-" + t + ") failed");
2144         }
2145     }
2146     n = Transliterator::countAvailableSources();
2147     logln((UnicodeString)"countAvailableSources at end: " + n);
2148     for (i=0; i<n; ++i) {
2149         UnicodeString s;
2150         Transliterator::getAvailableSource(i, s);
2151         for (j=0; j<3; ++j) {
2152             if (SOURCES[j] == nullptr) continue;
2153             if (s.caseCompare(SOURCES[j],0)==0) {
2154                 if (j!=2 || !logKnownIssue("21911", "ICU4C cannot create inverse of (or unregister) Any-Xxxx/Variant transform created from both-direction transform")) {
2155                     errln((UnicodeString)"FAIL: unregister(" + s + "-*) failed");
2156                 }
2157             }
2158         }
2159     }
2160 }
2161 
2162 /**
2163  * Test inverse of Greek-Latin; Title()
2164  */
TestCompoundInverse()2165 void TransliteratorTest::TestCompoundInverse() {
2166     UParseError parseError;
2167     UErrorCode status = U_ZERO_ERROR;
2168     Transliterator *t = Transliterator::createInstance
2169         ("Greek-Latin; Title()", UTRANS_REVERSE,parseError, status);
2170     if (t == 0) {
2171         dataerrln("FAIL: createInstance - %s", u_errorName(status));
2172         return;
2173     }
2174     UnicodeString exp("(Title);Latin-Greek");
2175     if (t->getID() == exp) {
2176         logln("Ok: inverse of \"Greek-Latin; Title()\" is \"" +
2177               t->getID());
2178     } else {
2179         errln("FAIL: inverse of \"Greek-Latin; Title()\" is \"" +
2180               t->getID() + "\", expected \"" + exp + "\"");
2181     }
2182     delete t;
2183 }
2184 
2185 /**
2186  * Test NFD chaining with RBT
2187  */
TestNFDChainRBT()2188 void TransliteratorTest::TestNFDChainRBT() {
2189     UParseError pe;
2190     UErrorCode ec = U_ZERO_ERROR;
2191     Transliterator* t = Transliterator::createFromRules(
2192                                "TEST", "::NFD; aa > Q; a > q;",
2193                                UTRANS_FORWARD, pe, ec);
2194     if (t == nullptr || U_FAILURE(ec)) {
2195         dataerrln("FAIL: Transliterator::createFromRules failed with %s", u_errorName(ec));
2196         return;
2197     }
2198     expect(*t, "aa", "Q");
2199     delete t;
2200 
2201     // TEMPORARY TESTS -- BEING DEBUGGED
2202 //=-    UnicodeString s, s2;
2203 //=-    t = Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, pe, ec);
2204 //=-    s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2205 //=-    s2 = CharsToUnicodeString("\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D");
2206 //=-    expect(*t, s, s2);
2207 //=-    delete t;
2208 //=-
2209 //=-    t = Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2210 //=-    expect(*t, s2, s);
2211 //=-    delete t;
2212 //=-
2213 //=-    t = Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2214 //=-    s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2215 //=-    expect(*t, s, s);
2216 //=-    delete t;
2217 
2218 //    const char* source[] = {
2219 //        /*
2220 //        "\\u015Br\\u012Bmad",
2221 //        "bhagavadg\\u012Bt\\u0101",
2222 //        "adhy\\u0101ya",
2223 //        "arjuna",
2224 //        "vi\\u1E63\\u0101da",
2225 //        "y\\u014Dga",
2226 //        "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2227 //        "uv\\u0101cr\\u0325",
2228 //        */
2229 //        "rmk\\u1E63\\u0113t",
2230 //      //"dharmak\\u1E63\\u0113tr\\u0113",
2231 //        /*
2232 //        "kuruk\\u1E63\\u0113tr\\u0113",
2233 //        "samav\\u0113t\\u0101",
2234 //        "yuyutsava-\\u1E25",
2235 //        "m\\u0101mak\\u0101-\\u1E25",
2236 //     // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2237 //        "kimakurvata",
2238 //        "san\\u0304java",
2239 //        */
2240 //
2241 //        0
2242 //    };
2243 //    const char* expected[] = {
2244 //        /*
2245 //        "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2246 //        "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2247 //        "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2248 //        "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2249 //        "\\u0935\\u093f\\u0937\\u093e\\u0926",
2250 //        "\\u092f\\u094b\\u0917",
2251 //        "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2252 //        "\\u0909\\u0935\\u093E\\u091A\\u0943",
2253 //        */
2254 //        "\\u0927",
2255 //        //"\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2256 //        /*
2257 //        "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2258 //        "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2259 //        "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2260 //        "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2261 //    //  "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2262 //        "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2263 //        "\\u0938\\u0902\\u091c\\u0935",
2264 //        */
2265 //        0
2266 //    };
2267 //    UErrorCode status = U_ZERO_ERROR;
2268 //    UParseError parseError;
2269 //    UnicodeString message;
2270 //    Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2271 //    Transliterator* devToLatinToDev=Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2272 //    if(U_FAILURE(status)){
2273 //        errln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2274 //        errln("PreContext: " + prettify(parseError.preContext) + "PostContext: " + prettify( parseError.postContext) );
2275 //        delete latinToDevToLatin;
2276 //        delete devToLatinToDev;
2277 //        return;
2278 //    }
2279 //    UnicodeString gotResult;
2280 //    for(int i= 0; source[i] != 0; i++){
2281 //        gotResult = source[i];
2282 //        expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2283 //        expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2284 //    }
2285 //    delete latinToDevToLatin;
2286 //    delete devToLatinToDev;
2287 }
2288 
2289 /**
2290  * Inverse of "Null" should be "Null". (J21)
2291  */
TestNullInverse()2292 void TransliteratorTest::TestNullInverse() {
2293     UParseError pe;
2294     UErrorCode ec = U_ZERO_ERROR;
2295     Transliterator *t = Transliterator::createInstance("Null", UTRANS_FORWARD, pe, ec);
2296     if (t == 0 || U_FAILURE(ec)) {
2297         errln("FAIL: createInstance");
2298         return;
2299     }
2300     Transliterator *u = t->createInverse(ec);
2301     if (u == 0 || U_FAILURE(ec)) {
2302         errln("FAIL: createInverse");
2303         delete t;
2304         return;
2305     }
2306     if (u->getID() != "Null") {
2307         errln("FAIL: Inverse of Null should be Null");
2308     }
2309     delete t;
2310     delete u;
2311 }
2312 
2313 /**
2314  * Check ID of inverse of alias. (J22)
2315  */
TestAliasInverseID()2316 void TransliteratorTest::TestAliasInverseID() {
2317     UnicodeString ID("Latin-Hangul", ""); // This should be any alias ID with an inverse
2318     UParseError pe;
2319     UErrorCode ec = U_ZERO_ERROR;
2320     Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2321     if (t == 0 || U_FAILURE(ec)) {
2322         dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2323         return;
2324     }
2325     Transliterator *u = t->createInverse(ec);
2326     if (u == 0 || U_FAILURE(ec)) {
2327         errln("FAIL: createInverse");
2328         delete t;
2329         return;
2330     }
2331     UnicodeString exp = "Hangul-Latin";
2332     UnicodeString got = u->getID();
2333     if (got != exp) {
2334         errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2335               ", expected " + exp);
2336     }
2337     delete t;
2338     delete u;
2339 }
2340 
2341 /**
2342  * Test IDs of inverses of compound transliterators. (J20)
2343  */
TestCompoundInverseID()2344 void TransliteratorTest::TestCompoundInverseID() {
2345     UnicodeString ID = "Latin-Jamo;NFC(NFD)";
2346     UParseError pe;
2347     UErrorCode ec = U_ZERO_ERROR;
2348     Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2349     if (t == 0 || U_FAILURE(ec)) {
2350         dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2351         return;
2352     }
2353     Transliterator *u = t->createInverse(ec);
2354     if (u == 0 || U_FAILURE(ec)) {
2355         errln("FAIL: createInverse");
2356         delete t;
2357         return;
2358     }
2359     UnicodeString exp = "NFD(NFC);Jamo-Latin";
2360     UnicodeString got = u->getID();
2361     if (got != exp) {
2362         errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2363               ", expected " + exp);
2364     }
2365     delete t;
2366     delete u;
2367 }
2368 
2369 /**
2370  * Test undefined variable.
2371 
2372  */
TestUndefinedVariable()2373 void TransliteratorTest::TestUndefinedVariable() {
2374     UnicodeString rule = "$initial } a <> \\u1161;";
2375     UParseError pe;
2376     UErrorCode ec = U_ZERO_ERROR;
2377     Transliterator *t = Transliterator::createFromRules("<ID>", rule, UTRANS_FORWARD, pe, ec);
2378     delete t;
2379     if (U_FAILURE(ec)) {
2380         logln((UnicodeString)"OK: Got exception for " + rule + ", as expected: " +
2381               u_errorName(ec));
2382         return;
2383     }
2384     errln((UnicodeString)"Fail: bogus rule " + rule + " compiled with error " +
2385           u_errorName(ec));
2386 }
2387 
2388 /**
2389  * Test empty context.
2390  */
TestEmptyContext()2391 void TransliteratorTest::TestEmptyContext() {
2392     expect(" { a } > b;", "xay a ", "xby b ");
2393 }
2394 
2395 /**
2396 * Test compound filter ID syntax
2397 */
TestCompoundFilterID()2398 void TransliteratorTest::TestCompoundFilterID() {
2399     static const char* DATA[] = {
2400         // Col. 1 = ID or rule set (latter must start with #)
2401 
2402         // = columns > 1 are null if expect col. 1 to be illegal =
2403 
2404         // Col. 2 = direction, "F..." or "R..."
2405         // Col. 3 = source string
2406         // Col. 4 = exp result
2407 
2408         "[abc]; [abc]", nullptr, nullptr, nullptr, // multiple filters
2409         "Latin-Greek; [abc];", nullptr, nullptr, nullptr, // misplaced filter
2410         "[b]; Latin-Greek; Upper; ([xyz])", "F", "abc", "a\\u0392c",
2411         "[b]; (Lower); Latin-Greek; Upper(); ([\\u0392])", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2412         "#\n::[b]; ::Latin-Greek; ::Upper; ::([xyz]);", "F", "abc", "a\\u0392c",
2413         "#\n::[b]; ::(Lower); ::Latin-Greek; ::Upper(); ::([\\u0392]);", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2414         nullptr,
2415     };
2416 
2417     for (int32_t i=0; DATA[i]; i+=4) {
2418         UnicodeString id = CharsToUnicodeString(DATA[i]);
2419         UTransDirection direction = (DATA[i+1] != nullptr && DATA[i+1][0] == 'R') ?
2420             UTRANS_REVERSE : UTRANS_FORWARD;
2421         UnicodeString source;
2422         UnicodeString exp;
2423         if (DATA[i+2] != nullptr) {
2424             source = CharsToUnicodeString(DATA[i+2]);
2425             exp = CharsToUnicodeString(DATA[i+3]);
2426         }
2427         UBool expOk = (DATA[i+1] != nullptr);
2428         LocalPointer<Transliterator> t;
2429         UParseError pe;
2430         UErrorCode ec = U_ZERO_ERROR;
2431         if (id.charAt(0) == 0x23/*#*/) {
2432             t.adoptInstead(Transliterator::createFromRules("ID", id, direction, pe, ec));
2433         } else {
2434             t.adoptInstead(Transliterator::createInstance(id, direction, pe, ec));
2435         }
2436         UBool ok = (t.isValid() && U_SUCCESS(ec));
2437         UnicodeString transID;
2438         if (t.isValid()) {
2439             transID = t->getID();
2440         }
2441         else {
2442             transID = UnicodeString("nullptr", "");
2443         }
2444         if (ok == expOk) {
2445             logln((UnicodeString)"Ok: " + id + " => " + transID + ", " +
2446                   u_errorName(ec));
2447             if (source.length() != 0) {
2448                 expect(*t, source, exp);
2449             }
2450         } else {
2451             dataerrln((UnicodeString)"FAIL: " + id + " => " + transID + ", " +
2452                   u_errorName(ec));
2453         }
2454     }
2455 }
2456 
2457 /**
2458  * Test new property set syntax
2459  */
TestPropertySet()2460 void TransliteratorTest::TestPropertySet() {
2461     expect(UNICODE_STRING_SIMPLE("a>A; \\p{Lu}>x; \\p{ANY}>y;"), "abcDEF", "Ayyxxx");
2462     expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
2463            "[ a stitch ]\n[ in time ]\r[ saves 9]");
2464 }
2465 
2466 /**
2467  * Test various failure points of the new 2.0 engine.
2468  */
TestNewEngine()2469 void TransliteratorTest::TestNewEngine() {
2470     UParseError pe;
2471     UErrorCode ec = U_ZERO_ERROR;
2472     Transliterator *t = Transliterator::createInstance("Latin-Hiragana", UTRANS_FORWARD, pe, ec);
2473     if (t == 0 || U_FAILURE(ec)) {
2474         dataerrln("FAIL: createInstance Latin-Hiragana - %s", u_errorName(ec));
2475         return;
2476     }
2477     // Katakana should be untouched
2478     expect(*t, CharsToUnicodeString("a\\u3042\\u30A2"),
2479            CharsToUnicodeString("\\u3042\\u3042\\u30A2"));
2480 
2481     delete t;
2482 
2483 #if 1
2484     // This test will only work if Transliterator.ROLLBACK is
2485     // true.  Otherwise, this test will fail, revealing a
2486     // limitation of global filters in incremental mode.
2487     Transliterator *a =
2488         Transliterator::createFromRules("a_to_A", "a > A;", UTRANS_FORWARD, pe, ec);
2489     Transliterator *A =
2490         Transliterator::createFromRules("A_to_b", "A > b;", UTRANS_FORWARD, pe, ec);
2491     if (U_FAILURE(ec)) {
2492         delete a;
2493         delete A;
2494         return;
2495     }
2496 
2497     Transliterator* array[3];
2498     array[0] = a;
2499     array[1] = Transliterator::createInstance("NFD", UTRANS_FORWARD, pe, ec);
2500     array[2] = A;
2501     if (U_FAILURE(ec)) {
2502         errln("FAIL: createInstance NFD");
2503         delete a;
2504         delete A;
2505         delete array[1];
2506         return;
2507     }
2508 
2509     t = new CompoundTransliterator(array, 3, new UnicodeSet("[:Ll:]", ec));
2510     if (U_FAILURE(ec)) {
2511         errln("FAIL: UnicodeSet constructor");
2512         delete a;
2513         delete A;
2514         delete array[1];
2515         delete t;
2516         return;
2517     }
2518 
2519     expect(*t, "aAaA", "bAbA");
2520 
2521     assertTrue("countElements", t->countElements() == 3);
2522     assertEquals("getElement(0)", t->getElement(0, ec).getID(), "a_to_A");
2523     assertEquals("getElement(1)", t->getElement(1, ec).getID(), "NFD");
2524     assertEquals("getElement(2)", t->getElement(2, ec).getID(), "A_to_b");
2525     assertSuccess("getElement", ec);
2526 
2527     delete a;
2528     delete A;
2529     delete array[1];
2530     delete t;
2531 #endif
2532 
2533     expect("$smooth = x; $macron = q; [:^L:] { ([aeiouyAEIOUY] $macron?) } [^aeiouyAEIOUY$smooth$macron] > | $1 $smooth ;",
2534            "a",
2535            "ax");
2536 
2537     UnicodeString gr = CharsToUnicodeString(
2538         "$ddot = \\u0308 ;"
2539         "$lcgvowel = [\\u03b1\\u03b5\\u03b7\\u03b9\\u03bf\\u03c5\\u03c9] ;"
2540         "$rough = \\u0314 ;"
2541         "($lcgvowel+ $ddot?) $rough > h | $1 ;"
2542         "\\u03b1 <> a ;"
2543         "$rough <> h ;");
2544 
2545     expect(gr, CharsToUnicodeString("\\u03B1\\u0314"), "ha");
2546 }
2547 
2548 /**
2549  * Test quantified segment behavior.  We want:
2550  * ([abc])+ > x $1 x; applied to "cba" produces "xax"
2551  */
TestQuantifiedSegment()2552 void TransliteratorTest::TestQuantifiedSegment() {
2553     // The normal case
2554     expect("([abc]+) > x $1 x;", "cba", "xcbax");
2555 
2556     // The tricky case; the quantifier is around the segment
2557     expect("([abc])+ > x $1 x;", "cba", "xax");
2558 
2559     // Tricky case in reverse direction
2560     expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
2561 
2562     // Check post-context segment
2563     expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
2564 
2565     // Test toRule/toPattern for non-quantified segment.
2566     // Careful with spacing here.
2567     UnicodeString r("([a-c]){q} > x $1 x;");
2568     UParseError pe;
2569     UErrorCode ec = U_ZERO_ERROR;
2570     Transliterator* t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2571     if (U_FAILURE(ec)) {
2572         errln("FAIL: createFromRules");
2573         delete t;
2574         return;
2575     }
2576     UnicodeString rr;
2577     t->toRules(rr, true);
2578     if (r != rr) {
2579         errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2580     } else {
2581         logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2582     }
2583     delete t;
2584 
2585     // Test toRule/toPattern for quantified segment.
2586     // Careful with spacing here.
2587     r = "([a-c])+{q} > x $1 x;";
2588     t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2589     if (U_FAILURE(ec)) {
2590         errln("FAIL: createFromRules");
2591         delete t;
2592         return;
2593     }
2594     t->toRules(rr, true);
2595     if (r != rr) {
2596         errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2597     } else {
2598         logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2599     }
2600     delete t;
2601 }
2602 
2603 //======================================================================
2604 // Ram's tests
2605 //======================================================================
TestDevanagariLatinRT()2606 void TransliteratorTest::TestDevanagariLatinRT(){
2607     const int MAX_LEN= 52;
2608     const char* const source[MAX_LEN] = {
2609         "bh\\u0101rata",
2610         "kra",
2611         "k\\u1E63a",
2612         "khra",
2613         "gra",
2614         "\\u1E45ra",
2615         "cra",
2616         "chra",
2617         "j\\u00F1a",
2618         "jhra",
2619         "\\u00F1ra",
2620         "\\u1E6Dya",
2621         "\\u1E6Dhra",
2622         "\\u1E0Dya",
2623       //"r\\u0323ya", // \u095c is not valid in Devanagari
2624         "\\u1E0Dhya",
2625         "\\u1E5Bhra",
2626         "\\u1E47ra",
2627         "tta",
2628         "thra",
2629         "dda",
2630         "dhra",
2631         "nna",
2632         "pra",
2633         "phra",
2634         "bra",
2635         "bhra",
2636         "mra",
2637         "\\u1E49ra",
2638       //"l\\u0331ra",
2639         "yra",
2640         "\\u1E8Fra",
2641       //"l-",
2642         "vra",
2643         "\\u015Bra",
2644         "\\u1E63ra",
2645         "sra",
2646         "hma",
2647         "\\u1E6D\\u1E6Da",
2648         "\\u1E6D\\u1E6Dha",
2649         "\\u1E6Dh\\u1E6Dha",
2650         "\\u1E0D\\u1E0Da",
2651         "\\u1E0D\\u1E0Dha",
2652         "\\u1E6Dya",
2653         "\\u1E6Dhya",
2654         "\\u1E0Dya",
2655         "\\u1E0Dhya",
2656         // Not roundtrippable --
2657         // \\u0939\\u094d\\u094d\\u092E  - hma
2658         // \\u0939\\u094d\\u092E         - hma
2659         // CharsToUnicodeString("hma"),
2660         "hya",
2661         "\\u015Br\\u0325",
2662         "\\u015Bca",
2663         "\\u0115",
2664         "san\\u0304j\\u012Bb s\\u0113nagupta",
2665         "\\u0101nand vaddir\\u0101ju",
2666         "\\u0101",
2667         "a"
2668     };
2669     const char* const expected[MAX_LEN] = {
2670         "\\u092D\\u093E\\u0930\\u0924",   /* bha\\u0304rata */
2671         "\\u0915\\u094D\\u0930",          /* kra         */
2672         "\\u0915\\u094D\\u0937",          /* ks\\u0323a  */
2673         "\\u0916\\u094D\\u0930",          /* khra        */
2674         "\\u0917\\u094D\\u0930",          /* gra         */
2675         "\\u0919\\u094D\\u0930",          /* n\\u0307ra  */
2676         "\\u091A\\u094D\\u0930",          /* cra         */
2677         "\\u091B\\u094D\\u0930",          /* chra        */
2678         "\\u091C\\u094D\\u091E",          /* jn\\u0303a  */
2679         "\\u091D\\u094D\\u0930",          /* jhra        */
2680         "\\u091E\\u094D\\u0930",          /* n\\u0303ra  */
2681         "\\u091F\\u094D\\u092F",          /* t\\u0323ya  */
2682         "\\u0920\\u094D\\u0930",          /* t\\u0323hra */
2683         "\\u0921\\u094D\\u092F",          /* d\\u0323ya  */
2684       //"\\u095C\\u094D\\u092F",        /* r\\u0323ya  */ // \u095c is not valid in Devanagari
2685         "\\u0922\\u094D\\u092F",          /* d\\u0323hya */
2686         "\\u0922\\u093C\\u094D\\u0930",   /* r\\u0323hra */
2687         "\\u0923\\u094D\\u0930",          /* n\\u0323ra  */
2688         "\\u0924\\u094D\\u0924",          /* tta         */
2689         "\\u0925\\u094D\\u0930",          /* thra        */
2690         "\\u0926\\u094D\\u0926",          /* dda         */
2691         "\\u0927\\u094D\\u0930",          /* dhra        */
2692         "\\u0928\\u094D\\u0928",          /* nna         */
2693         "\\u092A\\u094D\\u0930",          /* pra         */
2694         "\\u092B\\u094D\\u0930",          /* phra        */
2695         "\\u092C\\u094D\\u0930",          /* bra         */
2696         "\\u092D\\u094D\\u0930",          /* bhra        */
2697         "\\u092E\\u094D\\u0930",          /* mra         */
2698         "\\u0929\\u094D\\u0930",          /* n\\u0331ra  */
2699       //"\\u0934\\u094D\\u0930",        /* l\\u0331ra  */
2700         "\\u092F\\u094D\\u0930",          /* yra         */
2701         "\\u092F\\u093C\\u094D\\u0930",   /* y\\u0307ra  */
2702       //"l-",
2703         "\\u0935\\u094D\\u0930",          /* vra         */
2704         "\\u0936\\u094D\\u0930",          /* s\\u0301ra  */
2705         "\\u0937\\u094D\\u0930",          /* s\\u0323ra  */
2706         "\\u0938\\u094D\\u0930",          /* sra         */
2707         "\\u0939\\u094d\\u092E",          /* hma         */
2708         "\\u091F\\u094D\\u091F",          /* t\\u0323t\\u0323a  */
2709         "\\u091F\\u094D\\u0920",          /* t\\u0323t\\u0323ha */
2710         "\\u0920\\u094D\\u0920",          /* t\\u0323ht\\u0323ha*/
2711         "\\u0921\\u094D\\u0921",          /* d\\u0323d\\u0323a  */
2712         "\\u0921\\u094D\\u0922",          /* d\\u0323d\\u0323ha */
2713         "\\u091F\\u094D\\u092F",          /* t\\u0323ya  */
2714         "\\u0920\\u094D\\u092F",          /* t\\u0323hya */
2715         "\\u0921\\u094D\\u092F",          /* d\\u0323ya  */
2716         "\\u0922\\u094D\\u092F",          /* d\\u0323hya */
2717      // "hma",                         /* hma         */
2718         "\\u0939\\u094D\\u092F",          /* hya         */
2719         "\\u0936\\u0943",                 /* s\\u0301r\\u0325a  */
2720         "\\u0936\\u094D\\u091A",          /* s\\u0301ca  */
2721         "\\u090d",                        /* e\\u0306    */
2722         "\\u0938\\u0902\\u091C\\u0940\\u092C\\u094D \\u0938\\u0947\\u0928\\u0917\\u0941\\u092A\\u094D\\u0924",
2723         "\\u0906\\u0928\\u0902\\u0926\\u094D \\u0935\\u0926\\u094D\\u0926\\u093F\\u0930\\u093E\\u091C\\u0941",
2724         "\\u0906",
2725         "\\u0905",
2726     };
2727     UErrorCode status = U_ZERO_ERROR;
2728     UParseError parseError;
2729     UnicodeString message;
2730     Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2731     Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2732     if(U_FAILURE(status)){
2733         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2734         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2735         return;
2736     }
2737     UnicodeString gotResult;
2738     for(int i= 0; i<MAX_LEN; i++){
2739         gotResult = source[i];
2740         expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2741         expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2742     }
2743     delete latinToDev;
2744     delete devToLatin;
2745 }
2746 
TestTeluguLatinRT()2747 void TransliteratorTest::TestTeluguLatinRT(){
2748     const int MAX_LEN=10;
2749     const char* const source[MAX_LEN] = {
2750         "raghur\\u0101m vi\\u015Bvan\\u0101dha",                         /* Raghuram Viswanadha    */
2751         "\\u0101nand vaddir\\u0101ju",                                   /* Anand Vaddiraju        */
2752         "r\\u0101j\\u012Bv ka\\u015Barab\\u0101da",                      /* Rajeev Kasarabada      */
2753         "san\\u0304j\\u012Bv ka\\u015Barab\\u0101da",                    /* sanjeev kasarabada     */
2754         "san\\u0304j\\u012Bb sen'gupta",                                 /* sanjib sengupata       */
2755         "amar\\u0113ndra hanum\\u0101nula",                              /* Amarendra hanumanula   */
2756         "ravi kum\\u0101r vi\\u015Bvan\\u0101dha",                       /* Ravi Kumar Viswanadha  */
2757         "\\u0101ditya kandr\\u0113gula",                                 /* Aditya Kandregula      */
2758         "\\u015Br\\u012Bdhar ka\\u1E47\\u1E6Dama\\u015Be\\u1E6D\\u1E6Di",/* Shridhar Kantamsetty   */
2759         "m\\u0101dhav de\\u015Be\\u1E6D\\u1E6Di"                         /* Madhav Desetty         */
2760     };
2761 
2762     const char* const expected[MAX_LEN] = {
2763         "\\u0c30\\u0c18\\u0c41\\u0c30\\u0c3e\\u0c2e\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2764         "\\u0c06\\u0c28\\u0c02\\u0c26\\u0c4d \\u0C35\\u0C26\\u0C4D\\u0C26\\u0C3F\\u0C30\\u0C3E\\u0C1C\\u0C41",
2765         "\\u0c30\\u0c3e\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2766         "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2767         "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c2c\\u0c4d \\u0c38\\u0c46\\u0c28\\u0c4d\\u0c17\\u0c41\\u0c2a\\u0c4d\\u0c24",
2768         "\\u0c05\\u0c2e\\u0c30\\u0c47\\u0c02\\u0c26\\u0c4d\\u0c30 \\u0c39\\u0c28\\u0c41\\u0c2e\\u0c3e\\u0c28\\u0c41\\u0c32",
2769         "\\u0c30\\u0c35\\u0c3f \\u0c15\\u0c41\\u0c2e\\u0c3e\\u0c30\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2770         "\\u0c06\\u0c26\\u0c3f\\u0c24\\u0c4d\\u0c2f \\u0C15\\u0C02\\u0C26\\u0C4D\\u0C30\\u0C47\\u0C17\\u0C41\\u0c32",
2771         "\\u0c36\\u0c4d\\u0c30\\u0c40\\u0C27\\u0C30\\u0C4D \\u0c15\\u0c02\\u0c1f\\u0c2e\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2772         "\\u0c2e\\u0c3e\\u0c27\\u0c35\\u0c4d \\u0c26\\u0c46\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2773     };
2774 
2775     UErrorCode status = U_ZERO_ERROR;
2776     UParseError parseError;
2777     UnicodeString message;
2778     Transliterator* latinToDev=Transliterator::createInstance("Latin-Telugu", UTRANS_FORWARD, parseError, status);
2779     Transliterator* devToLatin=Transliterator::createInstance("Telugu-Latin", UTRANS_FORWARD, parseError, status);
2780     if(U_FAILURE(status)){
2781         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2782         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2783         return;
2784     }
2785     UnicodeString gotResult;
2786     for(int i= 0; i<MAX_LEN; i++){
2787         gotResult = source[i];
2788         expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2789         expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2790     }
2791     delete latinToDev;
2792     delete devToLatin;
2793 }
2794 
TestSanskritLatinRT()2795 void TransliteratorTest::TestSanskritLatinRT(){
2796     const int MAX_LEN =16;
2797     const char* const source[MAX_LEN] = {
2798         "rmk\\u1E63\\u0113t",
2799         "\\u015Br\\u012Bmad",
2800         "bhagavadg\\u012Bt\\u0101",
2801         "adhy\\u0101ya",
2802         "arjuna",
2803         "vi\\u1E63\\u0101da",
2804         "y\\u014Dga",
2805         "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2806         "uv\\u0101cr\\u0325",
2807         "dharmak\\u1E63\\u0113tr\\u0113",
2808         "kuruk\\u1E63\\u0113tr\\u0113",
2809         "samav\\u0113t\\u0101",
2810         "yuyutsava\\u1E25",
2811         "m\\u0101mak\\u0101\\u1E25",
2812     // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2813         "kimakurvata",
2814         "san\\u0304java",
2815     };
2816     const char* const expected[MAX_LEN] = {
2817         "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2818         "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2819         "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2820         "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2821         "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2822         "\\u0935\\u093f\\u0937\\u093e\\u0926",
2823         "\\u092f\\u094b\\u0917",
2824         "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2825         "\\u0909\\u0935\\u093E\\u091A\\u0943",
2826         "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2827         "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2828         "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2829         "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2830         "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2831     //"\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2832         "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2833         "\\u0938\\u0902\\u091c\\u0935",
2834     };
2835     UErrorCode status = U_ZERO_ERROR;
2836     UParseError parseError;
2837     UnicodeString message;
2838     Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2839     Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2840     if(U_FAILURE(status)){
2841         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2842         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2843         return;
2844     }
2845     UnicodeString gotResult;
2846     for(int i= 0; i<MAX_LEN; i++){
2847         gotResult = source[i];
2848         expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2849         expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2850     }
2851     delete latinToDev;
2852     delete devToLatin;
2853 }
2854 
2855 
TestCompoundLatinRT()2856 void TransliteratorTest::TestCompoundLatinRT(){
2857     const char* const source[] = {
2858         "rmk\\u1E63\\u0113t",
2859         "\\u015Br\\u012Bmad",
2860         "bhagavadg\\u012Bt\\u0101",
2861         "adhy\\u0101ya",
2862         "arjuna",
2863         "vi\\u1E63\\u0101da",
2864         "y\\u014Dga",
2865         "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2866         "uv\\u0101cr\\u0325",
2867         "dharmak\\u1E63\\u0113tr\\u0113",
2868         "kuruk\\u1E63\\u0113tr\\u0113",
2869         "samav\\u0113t\\u0101",
2870         "yuyutsava\\u1E25",
2871         "m\\u0101mak\\u0101\\u1E25",
2872      // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2873         "kimakurvata",
2874         "san\\u0304java"
2875     };
2876     const int MAX_LEN = UPRV_LENGTHOF(source);
2877     const char* const expected[MAX_LEN] = {
2878         "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2879         "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2880         "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2881         "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2882         "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2883         "\\u0935\\u093f\\u0937\\u093e\\u0926",
2884         "\\u092f\\u094b\\u0917",
2885         "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2886         "\\u0909\\u0935\\u093E\\u091A\\u0943",
2887         "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2888         "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2889         "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2890         "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2891         "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2892     //  "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2893         "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2894         "\\u0938\\u0902\\u091c\\u0935"
2895     };
2896     if(MAX_LEN != UPRV_LENGTHOF(expected)) {
2897         errln("error in TestCompoundLatinRT: source[] and expected[] have different lengths!");
2898         return;
2899     }
2900 
2901     UErrorCode status = U_ZERO_ERROR;
2902     UParseError parseError;
2903     UnicodeString message;
2904     Transliterator* devToLatinToDev  =Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2905     Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2906     Transliterator* devToTelToDev    =Transliterator::createInstance("Devanagari-Telugu;Telugu-Devanagari", UTRANS_FORWARD, parseError, status);
2907     Transliterator* latinToTelToLatin=Transliterator::createInstance("Latin-Telugu;Telugu-Latin", UTRANS_FORWARD, parseError, status);
2908 
2909     if(U_FAILURE(status)){
2910         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2911         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2912         return;
2913     }
2914     UnicodeString gotResult;
2915     for(int i= 0; i<MAX_LEN; i++){
2916         gotResult = source[i];
2917         expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2918         expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2919         expect(*latinToTelToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2920 
2921     }
2922     delete(latinToDevToLatin);
2923     delete(devToLatinToDev);
2924     delete(devToTelToDev);
2925     delete(latinToTelToLatin);
2926 }
2927 
2928 /**
2929  * Test Gurmukhi-Devanagari Tippi and Bindi
2930  */
TestGurmukhiDevanagari()2931 void TransliteratorTest::TestGurmukhiDevanagari(){
2932     // the rule says:
2933     // (\u0902) (when preceded by vowel)      --->  (\u0A02)
2934     // (\u0902) (when preceded by consonant)  --->  (\u0A70)
2935     UErrorCode status = U_ZERO_ERROR;
2936     UnicodeSet vowel(UnicodeString("[\\u0905-\\u090A \\u090F\\u0910\\u0913\\u0914 \\u093e-\\u0942\\u0947\\u0948\\u094B\\u094C\\u094D]", -1, US_INV).unescape(), status);
2937     UnicodeSet non_vowel(UnicodeString("[\\u0915-\\u0928\\u092A-\\u0930]", -1, US_INV).unescape(), status);
2938     UParseError parseError;
2939 
2940     UnicodeSetIterator vIter(vowel);
2941     UnicodeSetIterator nvIter(non_vowel);
2942     Transliterator* trans = Transliterator::createInstance("Devanagari-Gurmukhi",UTRANS_FORWARD, parseError, status);
2943     if(U_FAILURE(status)) {
2944       dataerrln("Error creating transliterator %s", u_errorName(status));
2945       delete trans;
2946       return;
2947     }
2948     UnicodeString src (" \\u0902", -1, US_INV);
2949     UnicodeString expected(" \\u0A02", -1, US_INV);
2950     src = src.unescape();
2951     expected= expected.unescape();
2952 
2953     while(vIter.next()){
2954         src.setCharAt(0,(char16_t) vIter.getCodepoint());
2955         expected.setCharAt(0,(char16_t) (vIter.getCodepoint()+0x0100));
2956         expect(*trans,src,expected);
2957     }
2958 
2959     expected.setCharAt(1,0x0A70);
2960     while(nvIter.next()){
2961         //src.setCharAt(0,(char) nvIter.codepoint);
2962         src.setCharAt(0,(char16_t)nvIter.getCodepoint());
2963         expected.setCharAt(0,(char16_t) (nvIter.getCodepoint()+0x0100));
2964         expect(*trans,src,expected);
2965     }
2966     delete trans;
2967 }
2968 /**
2969  * Test instantiation from a locale.
2970  */
TestLocaleInstantiation()2971 void TransliteratorTest::TestLocaleInstantiation() {
2972     UParseError pe;
2973     UErrorCode ec = U_ZERO_ERROR;
2974     Transliterator *t = Transliterator::createInstance("ru_RU-Latin", UTRANS_FORWARD, pe, ec);
2975     if (U_FAILURE(ec)) {
2976         dataerrln("FAIL: createInstance(ru_RU-Latin) - %s", u_errorName(ec));
2977         delete t;
2978         return;
2979     }
2980     expect(*t, CharsToUnicodeString("\\u0430"), "a");
2981     delete t;
2982 
2983     t = Transliterator::createInstance("en-el", UTRANS_FORWARD, pe, ec);
2984     if (U_FAILURE(ec)) {
2985         errln("FAIL: createInstance(en-el)");
2986         delete t;
2987         return;
2988     }
2989     expect(*t, "a", CharsToUnicodeString("\\u03B1"));
2990     delete t;
2991 }
2992 
2993 /**
2994  * Test title case handling of accent (should ignore accents)
2995  */
TestTitleAccents()2996 void TransliteratorTest::TestTitleAccents() {
2997     UParseError pe;
2998     UErrorCode ec = U_ZERO_ERROR;
2999     Transliterator *t = Transliterator::createInstance("Title", UTRANS_FORWARD, pe, ec);
3000     if (U_FAILURE(ec)) {
3001         errln("FAIL: createInstance(Title)");
3002         delete t;
3003         return;
3004     }
3005     expect(*t, CharsToUnicodeString("a\\u0300b can't abe"), CharsToUnicodeString("A\\u0300b Can't Abe"));
3006     delete t;
3007 }
3008 
3009 /**
3010  * Basic test of a locale resource based rule.
3011  */
TestLocaleResource()3012 void TransliteratorTest::TestLocaleResource() {
3013     const char* DATA[] = {
3014         // id                    from               to
3015         //"Latin-Greek/UNGEGN",    "b",               "\\u03bc\\u03c0",
3016         "Latin-el",              "b",               "\\u03bc\\u03c0",
3017         "Latin-Greek",           "b",               "\\u03B2",
3018         "Greek-Latin/UNGEGN",    "\\u03B2",         "v",
3019         "el-Latin",              "\\u03B2",         "v",
3020         "Greek-Latin",           "\\u03B2",         "b",
3021     };
3022     const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3023     for (int32_t i=0; i<DATA_length; i+=3) {
3024         UParseError pe;
3025         UErrorCode ec = U_ZERO_ERROR;
3026         Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, pe, ec);
3027         if (U_FAILURE(ec)) {
3028             dataerrln((UnicodeString)"FAIL: createInstance(" + DATA[i] + ") - " + u_errorName(ec));
3029             delete t;
3030             continue;
3031         }
3032         expect(*t, CharsToUnicodeString(DATA[i+1]),
3033                CharsToUnicodeString(DATA[i+2]));
3034         delete t;
3035     }
3036 }
3037 
3038 /**
3039  * Make sure parse errors reference the right line.
3040  */
TestParseError()3041 void TransliteratorTest::TestParseError() {
3042     static const char* rule =
3043         "a > b;\n"
3044         "# more stuff\n"
3045         "d << b;";
3046     UErrorCode ec = U_ZERO_ERROR;
3047     UParseError pe;
3048     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3049     delete t;
3050     if (U_FAILURE(ec)) {
3051         UnicodeString err(pe.preContext);
3052         err.append((char16_t)124/*|*/).append(pe.postContext);
3053         if (err.indexOf("d << b") >= 0) {
3054             logln("Ok: " + err);
3055         } else {
3056             errln("FAIL: " + err);
3057         }
3058     }
3059     else {
3060         errln("FAIL: no syntax error");
3061     }
3062     static const char* maskingRule =
3063         "a>x;\n"
3064         "# more stuff\n"
3065         "ab>y;";
3066     ec = U_ZERO_ERROR;
3067     delete Transliterator::createFromRules("ID", maskingRule, UTRANS_FORWARD, pe, ec);
3068     if (ec != U_RULE_MASK_ERROR) {
3069         errln("FAIL: returned %s instead of U_RULE_MASK_ERROR", u_errorName(ec));
3070     }
3071     else if (UnicodeString("a > x;") != UnicodeString(pe.preContext)) {
3072         errln("FAIL: did not get expected precontext");
3073     }
3074     else if (UnicodeString("ab > y;") != UnicodeString(pe.postContext)) {
3075         errln("FAIL: did not get expected postcontext");
3076     }
3077 }
3078 
3079 /**
3080  * Make sure sets on output are disallowed.
3081  */
TestOutputSet()3082 void TransliteratorTest::TestOutputSet() {
3083     UnicodeString rule = "$set = [a-cm-n]; b > $set;";
3084     UErrorCode ec = U_ZERO_ERROR;
3085     UParseError pe;
3086     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3087     delete t;
3088     if (U_FAILURE(ec)) {
3089         UnicodeString err(pe.preContext);
3090         err.append((char16_t)124/*|*/).append(pe.postContext);
3091         logln("Ok: " + err);
3092         return;
3093     }
3094     errln("FAIL: No syntax error");
3095 }
3096 
3097 /**
3098  * Test the use variable range pragma, making sure that use of
3099  * variable range characters is detected and flagged as an error.
3100  */
TestVariableRange()3101 void TransliteratorTest::TestVariableRange() {
3102     UnicodeString rule = "use variable range 0x70 0x72; a > A; b > B; q > Q;";
3103     UErrorCode ec = U_ZERO_ERROR;
3104     UParseError pe;
3105     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3106     delete t;
3107     if (U_FAILURE(ec)) {
3108         UnicodeString err(pe.preContext);
3109         err.append((char16_t)124/*|*/).append(pe.postContext);
3110         logln("Ok: " + err);
3111         return;
3112     }
3113     errln("FAIL: No syntax error");
3114 }
3115 
3116 /**
3117  * Test invalid post context error handling
3118  */
TestInvalidPostContext()3119 void TransliteratorTest::TestInvalidPostContext() {
3120     UnicodeString rule = "a}b{c>d;";
3121     UErrorCode ec = U_ZERO_ERROR;
3122     UParseError pe;
3123     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3124     delete t;
3125     if (U_FAILURE(ec)) {
3126         UnicodeString err(pe.preContext);
3127         err.append((char16_t)124/*|*/).append(pe.postContext);
3128         if (err.indexOf("a}b{c") >= 0) {
3129             logln("Ok: " + err);
3130         } else {
3131             errln("FAIL: " + err);
3132         }
3133         return;
3134     }
3135     errln("FAIL: No syntax error");
3136 }
3137 
3138 /**
3139  * Test ID form variants
3140  */
TestIDForms()3141 void TransliteratorTest::TestIDForms() {
3142     const char* DATA[] = {
3143         "NFC", nullptr, "NFD",
3144         "nfd", nullptr, "NFC", // make sure case is ignored
3145         "Any-NFKD", nullptr, "Any-NFKC",
3146         "Null", nullptr, "Null",
3147         "-nfkc", "nfkc", "NFKD",
3148         "-nfkc/", "nfkc", "NFKD",
3149         "Latin-Greek/UNGEGN", nullptr, "Greek-Latin/UNGEGN",
3150         "Greek/UNGEGN-Latin", "Greek-Latin/UNGEGN", "Latin-Greek/UNGEGN",
3151         "Bengali-Devanagari/", "Bengali-Devanagari", "Devanagari-Bengali",
3152         "Source-", nullptr, nullptr,
3153         "Source/Variant-", nullptr, nullptr,
3154         "Source-/Variant", nullptr, nullptr,
3155         "/Variant", nullptr, nullptr,
3156         "/Variant-", nullptr, nullptr,
3157         "-/Variant", nullptr, nullptr,
3158         "-/", nullptr, nullptr,
3159         "-", nullptr, nullptr,
3160         "/", nullptr, nullptr,
3161     };
3162     const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3163 
3164     for (int32_t i=0; i<DATA_length; i+=3) {
3165         const char* ID = DATA[i];
3166         const char* expID = DATA[i+1];
3167         const char* expInvID = DATA[i+2];
3168         UBool expValid = (expInvID != nullptr);
3169         if (expID == nullptr) {
3170             expID = ID;
3171         }
3172         UParseError pe;
3173         UErrorCode ec = U_ZERO_ERROR;
3174         Transliterator *t =
3175             Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
3176         if (U_FAILURE(ec)) {
3177             if (!expValid) {
3178                 logln((UnicodeString)"Ok: getInstance(" + ID +") => " + u_errorName(ec));
3179             } else {
3180                 dataerrln((UnicodeString)"FAIL: Couldn't create " + ID + " - " + u_errorName(ec));
3181             }
3182             delete t;
3183             continue;
3184         }
3185         Transliterator *u = t->createInverse(ec);
3186         if (U_FAILURE(ec)) {
3187             errln((UnicodeString)"FAIL: Couldn't create inverse of " + ID);
3188             delete t;
3189             delete u;
3190             continue;
3191         }
3192         if (t->getID() == expID &&
3193             u->getID() == expInvID) {
3194             logln((UnicodeString)"Ok: " + ID + ".getInverse() => " + expInvID);
3195         } else {
3196             errln((UnicodeString)"FAIL: getInstance(" + ID + ") => " +
3197                   t->getID() + " x getInverse() => " + u->getID() +
3198                   ", expected " + expInvID);
3199         }
3200         delete t;
3201         delete u;
3202     }
3203 }
3204 
// NUL-terminated UTF-16 strings used by checkRules() when stripping
// whitespace out of rule text; EMPTY is the replacement text.
static const char16_t SPACE[]   = {0x0020, 0};  // U+0020 SPACE
static const char16_t NEWLINE[] = {0x000A, 0};  // U+000A LINE FEED
static const char16_t RETURN[]  = {0x000D, 0};  // U+000D CARRIAGE RETURN
static const char16_t EMPTY[]   = {0};          // empty string
3209 
checkRules(const UnicodeString & label,Transliterator & t2,const UnicodeString & testRulesForward)3210 void TransliteratorTest::checkRules(const UnicodeString& label, Transliterator& t2,
3211                                     const UnicodeString& testRulesForward) {
3212     UnicodeString rules2; t2.toRules(rules2, true);
3213     //rules2 = TestUtility.replaceAll(rules2, new UnicodeSet("[' '\n\r]"), "");
3214     rules2.findAndReplace(SPACE, EMPTY);
3215     rules2.findAndReplace(NEWLINE, EMPTY);
3216     rules2.findAndReplace(RETURN, EMPTY);
3217 
3218     UnicodeString testRules(testRulesForward); testRules.findAndReplace(SPACE, EMPTY);
3219 
3220     if (rules2 != testRules) {
3221         errln(label);
3222         logln((UnicodeString)"GENERATED RULES: " + rules2);
3223         logln((UnicodeString)"SHOULD BE:       " + testRulesForward);
3224     }
3225 }
3226 
3227 /**
3228  * Mark's toRules test.
3229  */
TestToRulesMark()3230 void TransliteratorTest::TestToRulesMark() {
3231     const char* testRules =
3232         "::[[:Latin:][:Mark:]];"
3233         "::NFKD (NFC);"
3234         "::Lower (Lower);"
3235         "a <> \\u03B1;" // alpha
3236         "::NFKC (NFD);"
3237         "::Upper (Lower);"
3238         "::Lower ();"
3239         "::([[:Greek:][:Mark:]]);"
3240         ;
3241     const char* testRulesForward =
3242         "::[[:Latin:][:Mark:]];"
3243         "::NFKD(NFC);"
3244         "::Lower(Lower);"
3245         "a > \\u03B1;"
3246         "::NFKC(NFD);"
3247         "::Upper (Lower);"
3248         "::Lower ();"
3249         ;
3250     const char* testRulesBackward =
3251         "::[[:Greek:][:Mark:]];"
3252         "::Lower (Upper);"
3253         "::NFD(NFKC);"
3254         "\\u03B1 > a;"
3255         "::Lower(Lower);"
3256         "::NFC(NFKD);"
3257         ;
3258     UnicodeString source = CharsToUnicodeString("\\u00E1"); // a-acute
3259     UnicodeString target = CharsToUnicodeString("\\u03AC"); // alpha-acute
3260 
3261     UParseError pe;
3262     UErrorCode ec = U_ZERO_ERROR;
3263     LocalPointer<Transliterator> t2(
3264             Transliterator::createFromRules("source-target", UnicodeString(testRules, -1, US_INV), UTRANS_FORWARD, pe, ec));
3265     LocalPointer<Transliterator> t3(
3266             Transliterator::createFromRules("target-source", UnicodeString(testRules, -1, US_INV), UTRANS_REVERSE, pe, ec));
3267 
3268     if (U_FAILURE(ec)) {
3269         dataerrln((UnicodeString)"FAIL: createFromRules => " + u_errorName(ec));
3270         return;
3271     }
3272 
3273     expect(*t2, source, target);
3274     expect(*t3, target, source);
3275 
3276     checkRules("Failed toRules FORWARD", *t2, UnicodeString(testRulesForward, -1, US_INV));
3277     checkRules("Failed toRules BACKWARD", *t3, UnicodeString(testRulesBackward, -1, US_INV));
3278 }
3279 
3280 /**
3281  * Test Escape and Unescape transliterators.
3282  */
TestEscape()3283 void TransliteratorTest::TestEscape() {
3284     UParseError pe;
3285     UErrorCode ec;
3286     Transliterator *t;
3287 
3288     ec = U_ZERO_ERROR;
3289     t = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, pe, ec);
3290     if (U_FAILURE(ec)) {
3291         errln((UnicodeString)"FAIL: createInstance");
3292     } else {
3293         expect(*t,
3294                UNICODE_STRING_SIMPLE("\\x{40}\\U00000031&#x32;&#81;"),
3295                "@12Q");
3296     }
3297     delete t;
3298 
3299     ec = U_ZERO_ERROR;
3300     t = Transliterator::createInstance("Any-Hex/C", UTRANS_FORWARD, pe, ec);
3301     if (U_FAILURE(ec)) {
3302         errln((UnicodeString)"FAIL: createInstance");
3303     } else {
3304         expect(*t,
3305                CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3306                UNICODE_STRING_SIMPLE("\\u0041\\U0010BEEF\\uFEED"));
3307     }
3308     delete t;
3309 
3310     ec = U_ZERO_ERROR;
3311     t = Transliterator::createInstance("Any-Hex/Java", UTRANS_FORWARD, pe, ec);
3312     if (U_FAILURE(ec)) {
3313         errln((UnicodeString)"FAIL: createInstance");
3314     } else {
3315         expect(*t,
3316                CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3317                UNICODE_STRING_SIMPLE("\\u0041\\uDBEF\\uDEEF\\uFEED"));
3318     }
3319     delete t;
3320 
3321     ec = U_ZERO_ERROR;
3322     t = Transliterator::createInstance("Any-Hex/Perl", UTRANS_FORWARD, pe, ec);
3323     if (U_FAILURE(ec)) {
3324         errln((UnicodeString)"FAIL: createInstance");
3325     } else {
3326         expect(*t,
3327                CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3328                UNICODE_STRING_SIMPLE("\\x{41}\\x{10BEEF}\\x{FEED}"));
3329     }
3330     delete t;
3331 }
3332 
3333 
TestAnchorMasking()3334 void TransliteratorTest::TestAnchorMasking(){
3335     UnicodeString rule ("^a > Q; a > q;");
3336     UErrorCode status= U_ZERO_ERROR;
3337     UParseError parseError;
3338 
3339     Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
3340     if(U_FAILURE(status)){
3341         errln(UnicodeString("FAIL: ") + "ID" +
3342               ".createFromRules() => bad rules" +
3343               /*", parse error " + parseError.code +*/
3344               ", line " + parseError.line +
3345               ", offset " + parseError.offset +
3346               ", context " + prettify(parseError.preContext, true) +
3347               ", rules: " + prettify(rule, true));
3348     }
3349     delete t;
3350 }
3351 
3352 /**
3353  * Make sure display names of variants look reasonable.
3354  */
TestDisplayName()3355 void TransliteratorTest::TestDisplayName() {
3356 #if UCONFIG_NO_FORMATTING
3357     logln("Skipping, UCONFIG_NO_FORMATTING is set\n");
3358     return;
3359 #else
3360     static const char* DATA[] = {
3361         // ID, forward name, reverse name
3362         // Update the text as necessary -- the important thing is
3363         // not the text itself, but how various cases are handled.
3364 
3365         // Basic test
3366         "Any-Hex", "Any to Hex Escape", "Hex Escape to Any",
3367 
3368         // Variants
3369         "Any-Hex/Perl", "Any to Hex Escape/Perl", "Hex Escape to Any/Perl",
3370 
3371         // Target-only IDs
3372         "NFC", "Any to NFC", "Any to NFD",
3373     };
3374 
3375     int32_t DATA_length = UPRV_LENGTHOF(DATA);
3376 
3377     Locale US("en", "US");
3378 
3379     for (int32_t i=0; i<DATA_length; i+=3) {
3380         UnicodeString name;
3381         Transliterator::getDisplayName(DATA[i], US, name);
3382         if (name != DATA[i+1]) {
3383             dataerrln((UnicodeString)"FAIL: " + DATA[i] + ".getDisplayName() => " +
3384                   name + ", expected " + DATA[i+1]);
3385         } else {
3386             logln((UnicodeString)"Ok: " + DATA[i] + ".getDisplayName() => " + name);
3387         }
3388         UErrorCode ec = U_ZERO_ERROR;
3389         UParseError pe;
3390         Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_REVERSE, pe, ec);
3391         if (U_FAILURE(ec)) {
3392             delete t;
3393             dataerrln("FAIL: createInstance failed - %s", u_errorName(ec));
3394             continue;
3395         }
3396         name = Transliterator::getDisplayName(t->getID(), US, name);
3397         if (name != DATA[i+2]) {
3398             dataerrln((UnicodeString)"FAIL: " + t->getID() + ".getDisplayName() => " +
3399                   name + ", expected " + DATA[i+2]);
3400         } else {
3401             logln((UnicodeString)"Ok: " + t->getID() + ".getDisplayName() => " + name);
3402         }
3403         delete t;
3404     }
3405 #endif
3406 }
3407 
TestSpecialCases()3408 void TransliteratorTest::TestSpecialCases() {
3409     const UnicodeString registerRules[] = {
3410         "Any-Dev1", "x > X; y > Y;",
3411         "Any-Dev2", "XY > Z",
3412         "Greek-Latin/FAKE",
3413             CharsToUnicodeString
3414             ("[^[:L:][:M:]] { \\u03bc\\u03c0 > b ; \\u03bc\\u03c0 } [^[:L:][:M:]] > b ; [^[:L:][:M:]] { [\\u039c\\u03bc][\\u03a0\\u03c0] > B ; [\\u039c\\u03bc][\\u03a0\\u03c0] } [^[:L:][:M:]] > B ;"),
3415         "" // END MARKER
3416     };
3417 
3418     const UnicodeString testCases[] = {
3419         // NORMALIZATION
3420         // should add more test cases
3421         "NFD" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3422         "NFC" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3423         "NFKD", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3424         "NFKC", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3425 
3426         // mp -> b BUG
3427         "Greek-Latin/UNGEGN", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3428         "Greek-Latin/FAKE", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3429 
3430         // check for devanagari bug
3431         "nfd;Dev1;Dev2;nfc", "xy", "Z",
3432 
3433         // ff, i, dotless-i, I, dotted-I, LJLjlj deseret deeDEE
3434         "Title", CharsToUnicodeString("ab'cD ffi\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3435                  CharsToUnicodeString("Ab'cd Ffi\\u0131ii\\u0307 \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3436 
3437         //TODO: enable this test once Titlecase works right
3438         /*
3439         "Title", CharsToUnicodeString("\\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3440                  CharsToUnicodeString("Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3441                  */
3442         "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3443                  CharsToUnicodeString("AB'CD FFIII\\u0130 \\u01C7\\u01C7\\u01C7 ") + DESERET_DEE + DESERET_DEE,
3444         "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3445                  CharsToUnicodeString("ab'cd \\uFB00i\\u0131ii\\u0307 \\u01C9\\u01C9\\u01C9 ") + DESERET_dee + DESERET_dee,
3446 
3447         "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3448         "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3449 
3450          // FORMS OF S
3451         "Greek-Latin/UNGEGN",  CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3452                                CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3453         "Latin-Greek/UNGEGN",  CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3454                                CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3") ,
3455         "Greek-Latin",  CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3456                         CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3457         "Latin-Greek",  CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3458                         CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3459         // Tatiana bug
3460         // Upper: TAT\\u02B9\\u00C2NA
3461         // Lower: tat\\u02B9\\u00E2na
3462         // Title: Tat\\u02B9\\u00E2na
3463         "Upper", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3464                  CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3465         "Lower", CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3466                  CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3467         "Title", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3468                  CharsToUnicodeString("Tat\\u02B9\\u00E2na"),
3469 
3470         "" // END MARKER
3471     };
3472 
3473     UParseError pos;
3474     int32_t i;
3475     for (i = 0; registerRules[i].length()!=0; i+=2) {
3476         UErrorCode status = U_ZERO_ERROR;
3477 
3478         Transliterator *t = Transliterator::createFromRules(registerRules[0+i],
3479             registerRules[i+1], UTRANS_FORWARD, pos, status);
3480         if (U_FAILURE(status)) {
3481             dataerrln("Fails: Unable to create the transliterator from rules. - %s", u_errorName(status));
3482         } else {
3483             Transliterator::registerInstance(t);
3484         }
3485     }
3486     for (i = 0; testCases[i].length()!=0; i+=3) {
3487         UErrorCode ec = U_ZERO_ERROR;
3488         UParseError pe;
3489         const UnicodeString& name = testCases[i];
3490         Transliterator *t = Transliterator::createInstance(name, UTRANS_FORWARD, pe, ec);
3491         if (U_FAILURE(ec)) {
3492             dataerrln((UnicodeString)"FAIL: Couldn't create " + name + " - " + u_errorName(ec));
3493             delete t;
3494             continue;
3495         }
3496         const UnicodeString& id = t->getID();
3497         const UnicodeString& source = testCases[i+1];
3498         UnicodeString target;
3499 
3500         // Automatic generation of targets, to make it simpler to add test cases (and more fail-safe)
3501 
3502         if (testCases[i+2].length() > 0) {
3503             target = testCases[i+2];
3504         } else if (0==id.caseCompare("NFD", U_FOLD_CASE_DEFAULT)) {
3505             Normalizer::normalize(source, UNORM_NFD, 0, target, ec);
3506         } else if (0==id.caseCompare("NFC", U_FOLD_CASE_DEFAULT)) {
3507             Normalizer::normalize(source, UNORM_NFC, 0, target, ec);
3508         } else if (0==id.caseCompare("NFKD", U_FOLD_CASE_DEFAULT)) {
3509             Normalizer::normalize(source, UNORM_NFKD, 0, target, ec);
3510         } else if (0==id.caseCompare("NFKC", U_FOLD_CASE_DEFAULT)) {
3511             Normalizer::normalize(source, UNORM_NFKC, 0, target, ec);
3512         } else if (0==id.caseCompare("Lower", U_FOLD_CASE_DEFAULT)) {
3513             target = source;
3514             target.toLower(Locale::getUS());
3515         } else if (0==id.caseCompare("Upper", U_FOLD_CASE_DEFAULT)) {
3516             target = source;
3517             target.toUpper(Locale::getUS());
3518         }
3519         if (U_FAILURE(ec)) {
3520             errln((UnicodeString)"FAIL: Internal error normalizing " + source);
3521             continue;
3522         }
3523 
3524         expect(*t, source, target);
3525         delete t;
3526     }
3527     for (i = 0; registerRules[i].length()!=0; i+=2) {
3528         Transliterator::unregister(registerRules[i]);
3529     }
3530 }
3531 
/**
 * Formats a code point as a backslash escape in a caller-supplied buffer:
 * "\uXXXX" (4 lowercase hex digits) for values up to 0xFFFF, otherwise
 * "\UXXXXXXXX" (8 lowercase hex digits).
 *
 * @param ch     the code point to format
 * @param buffer destination; receives a NUL-terminated string, truncated to
 *               fit when n is too small
 * @param n      capacity of buffer in chars, including the terminating NUL
 * @return buffer (left untouched when it is null or n is 0)
 */
char* Char32ToEscapedChars(UChar32 ch, char* buffer, size_t n) {
    if (buffer == nullptr || n == 0) {
        return buffer;  // nowhere to write
    }
    // %x consumes an unsigned int, so convert explicitly rather than passing
    // a signed value.
    const unsigned int value = static_cast<unsigned int>(ch);
    if (ch <= 0xFFFF) {
        snprintf(buffer, n, "\\u%04x", value);
    } else {
        snprintf(buffer, n, "\\U%08x", value);
    }
    return buffer;
}
3540 
TestSurrogateCasing()3541 void TransliteratorTest::TestSurrogateCasing() {
3542     // check that casing handles surrogates
3543     // titlecase is currently defective
3544     char buffer[20];
3545     char16_t buffer2[20];
3546     UChar32 dee;
3547     U16_GET(DESERET_dee,0, 0, DESERET_dee.length(), dee);
3548     UnicodeString DEE(u_totitle(dee));
3549     if (DEE != DESERET_DEE) {
3550         err("Fails titlecase of surrogates");
3551         err(Char32ToEscapedChars(dee, buffer, sizeof(buffer)));
3552         err(", ");
3553         errln(Char32ToEscapedChars(DEE.char32At(0), buffer, sizeof(buffer)));
3554     }
3555 
3556     UnicodeString deeDEETest=DESERET_dee + DESERET_DEE;
3557     UnicodeString deedeeTest = DESERET_dee + DESERET_dee;
3558     UnicodeString DEEDEETest = DESERET_DEE + DESERET_DEE;
3559     UErrorCode status= U_ZERO_ERROR;
3560 
3561     u_strToUpper(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), nullptr, &status);
3562     if (U_FAILURE(status) || (UnicodeString(buffer2)!= DEEDEETest)) {
3563         errln("Fails: Can't uppercase surrogates.");
3564     }
3565 
3566     status= U_ZERO_ERROR;
3567     u_strToLower(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), nullptr, &status);
3568     if (U_FAILURE(status) || (UnicodeString(buffer2)!= deedeeTest)) {
3569         errln("Fails: Can't lowercase surrogates.");
3570     }
3571 }
3572 
_trans(Transliterator & t,const UnicodeString & src,UnicodeString & result)3573 static void _trans(Transliterator& t, const UnicodeString& src,
3574                    UnicodeString& result) {
3575     result = src;
3576     t.transliterate(result);
3577 }
3578 
_trans(const UnicodeString & id,const UnicodeString & src,UnicodeString & result,UErrorCode ec)3579 static void _trans(const UnicodeString& id, const UnicodeString& src,
3580                    UnicodeString& result, UErrorCode ec) {
3581     UParseError pe;
3582     Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
3583     if (U_SUCCESS(ec)) {
3584         _trans(*t, src, result);
3585     }
3586     delete t;
3587 }
3588 
/**
 * Looks up source in a table of (key, value) string pairs, ignoring case
 * (default case folding).
 *
 * @param source key to look for
 * @param pairs  key/value pairs laid out consecutively; an empty key marks
 *               the end of the table
 * @return the value paired with the first matching key, or an empty string
 *         when no key matches
 */
static UnicodeString _findMatch(const UnicodeString& source,
                                const UnicodeString* pairs) {
    for (const UnicodeString* entry = pairs; !entry->isEmpty(); entry += 2) {
        if (source.caseCompare(entry[0], U_FOLD_CASE_DEFAULT) == 0) {
            return entry[1];
        }
    }
    return UnicodeString();
}
3599 
3600 // Check to see that incremental gets at least part way through a reasonable string.
3601 
TestIncrementalProgress()3602 void TransliteratorTest::TestIncrementalProgress() {
3603     UErrorCode ec = U_ZERO_ERROR;
3604     UnicodeString latinTest = "The Quick Brown Fox.";
3605     UnicodeString devaTest;
3606     _trans("Latin-Devanagari", latinTest, devaTest, ec);
3607     UnicodeString kataTest;
3608     _trans("Latin-Katakana", latinTest, kataTest, ec);
3609     if (U_FAILURE(ec)) {
3610         errln("FAIL: Internal error");
3611         return;
3612     }
3613     const UnicodeString tests[] = {
3614         "Any", latinTest,
3615         "Latin", latinTest,
3616         "Halfwidth", latinTest,
3617         "Devanagari", devaTest,
3618         "Katakana", kataTest,
3619         "" // END MARKER
3620     };
3621 
3622     UnicodeString test("The Quick Brown Fox Jumped Over The Lazy Dog.");
3623     int32_t i = 0, j=0, k=0;
3624     int32_t sources = Transliterator::countAvailableSources();
3625     for (i = 0; i < sources; i++) {
3626         UnicodeString source;
3627         Transliterator::getAvailableSource(i, source);
3628         UnicodeString test = _findMatch(source, tests);
3629         if (test.length() == 0) {
3630             logln((UnicodeString)"Skipping " + source + "-X");
3631             continue;
3632         }
3633         int32_t targets = Transliterator::countAvailableTargets(source);
3634         for (j = 0; j < targets; j++) {
3635             UnicodeString target;
3636             Transliterator::getAvailableTarget(j, source, target);
3637             int32_t variants = Transliterator::countAvailableVariants(source, target);
3638             for (k =0; k< variants; k++) {
3639                 UnicodeString variant;
3640                 UParseError err;
3641                 UErrorCode status = U_ZERO_ERROR;
3642 
3643                 Transliterator::getAvailableVariant(k, source, target, variant);
3644                 UnicodeString id = source + "-" + target + "/" + variant;
3645 
3646                 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, err, status);
3647                 if (U_FAILURE(status)) {
3648                     dataerrln((UnicodeString)"FAIL: Could not create " + id + ", status " + u_errorName(status));
3649                     delete t;
3650                     continue;
3651                 }
3652                 status = U_ZERO_ERROR;
3653                 CheckIncrementalAux(t, test);
3654 
3655                 UnicodeString rev;
3656                 _trans(*t, test, rev);
3657                 Transliterator *inv = t->createInverse(status);
3658                 if (U_FAILURE(status)) {
3659                     // The following are forward-only, it is OK that creating an inverse will not work:
3660                     // 1. Devanagari-Arabic
3661                     // 2. Any-*/BGN
3662                     // 2a. Any-*/BGN_1981
3663                     // 3. Any-*/MNS
3664                     // 3a. Any-*/Geminate[d]
3665                     //
3666                     // 4. If UCONFIG_NO_BREAK_ITERATION is on, Latin-Thai is also not expected to work.
3667                     //
3668                     // The following are direction="both" transforms with variants, inverting the Any-Xxxx/Variant for
3669                     // any of these does not work; see ICU-21911 (not sure whether this is intentional or an ICU bug).
3670                     // Unfortunately we do not easily have the info at this point as to whether the original transform
3671                     // had direction="both" specified.
3672                     // 5. Any-*/UNGEGN
3673                     // 6. Any-Ethiopic/*
3674                     // 7. Any-Braille/*
3675                     // 8. Any-*/Gurage_2013
3676                     // 9. Any-*/Gutgarts
3677                     // 10. Any-*/Tekie_Alibekit
3678                     // 11. Any-*/Xaleget
3679                     //
3680                     if (    id.compare((UnicodeString)"Devanagari-Arabic/") != 0
3681                          && !(id.startsWith((UnicodeString)"Any-") &&
3682                                 (id.endsWith((UnicodeString)"/BGN") || id.endsWith((UnicodeString)"/BGN_1981") || id.endsWith((UnicodeString)"/MNS") ||
3683                                  id.endsWith((UnicodeString)"/Geminate") || id.endsWith((UnicodeString)"/Geminated"))
3684                              )
3685 #if UCONFIG_NO_BREAK_ITERATION
3686                          && id.compare((UnicodeString)"Latin-Thai/") != 0
3687 #endif
3688                          && !(logKnownIssue("21911", "ICU4C cannot create inverse of Any-Xxxx/Variant transform created from both-direction transform") &&
3689                                 id.startsWith((UnicodeString)"Any-") &&
3690                                 (id.endsWith((UnicodeString)"/UNGEGN") || id.startsWith((UnicodeString)"Any-Ethiopic/") || id.startsWith((UnicodeString)"Any-Braille/") ||
3691                                  id.endsWith((UnicodeString)"/Gurage_2013") || id.endsWith((UnicodeString)"/Gutgarts") || id.endsWith((UnicodeString)"/Tekie_Alibekit") ||
3692                                  id.endsWith((UnicodeString)"/Xaleget"))
3693                              )
3694                        )
3695                     {
3696                         errln((UnicodeString)"FAIL: Could not create inverse of " + id + ", status " + u_errorName(status));
3697                     }
3698                     delete t;
3699                     delete inv;
3700                     continue;
3701                 }
3702                 CheckIncrementalAux(inv, rev);
3703                 delete t;
3704                 delete inv;
3705             }
3706         }
3707     }
3708 }
3709 
CheckIncrementalAux(const Transliterator * t,const UnicodeString & input)3710 void TransliteratorTest::CheckIncrementalAux(const Transliterator* t,
3711                                                       const UnicodeString& input) {
3712     UErrorCode ec = U_ZERO_ERROR;
3713     UTransPosition pos;
3714     UnicodeString test = input;
3715 
3716     pos.contextStart = 0;
3717     pos.contextLimit = input.length();
3718     pos.start = 0;
3719     pos.limit = input.length();
3720 
3721     t->transliterate(test, pos, ec);
3722     if (U_FAILURE(ec)) {
3723         errln((UnicodeString)"FAIL: transliterate() error " + u_errorName(ec));
3724         return;
3725     }
3726     UBool gotError = false;
3727     (void)gotError;    // Suppress set but not used warning.
3728 
3729     // we have a few special cases. Any-Remove (pos.start = 0, but also = limit) and U+XXXXX?X?
3730 
3731     if (pos.start == 0 && pos.limit != 0 && t->getID() != "Hex-Any/Unicode") {
3732         errln((UnicodeString)"No Progress, " +
3733               t->getID() + ": " + formatInput(test, input, pos));
3734         gotError = true;
3735     } else {
3736         logln((UnicodeString)"PASS Progress, " +
3737               t->getID() + ": " + formatInput(test, input, pos));
3738     }
3739     t->finishTransliteration(test, pos);
3740     if (pos.start != pos.limit) {
3741         errln((UnicodeString)"Incomplete, " +
3742               t->getID() + ": " + formatInput(test, input, pos));
3743         gotError = true;
3744     }
3745 }
3746 
TestFunction()3747 void TransliteratorTest::TestFunction() {
3748     // Careful with spacing and ';' here:  Phrase this exactly
3749     // as toRules() is going to return it.  If toRules() changes
3750     // with regard to spacing or ';', then adjust this string.
3751     UnicodeString rule =
3752         "([:Lu:]) > $1 '(' &Lower( $1 ) '=' &Hex( &Any-Lower( $1 ) ) ')';";
3753 
3754     UParseError pe;
3755     UErrorCode ec = U_ZERO_ERROR;
3756     Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3757     if (t == nullptr) {
3758         dataerrln("FAIL: createFromRules failed - %s", u_errorName(ec));
3759         return;
3760     }
3761 
3762     UnicodeString r;
3763     t->toRules(r, true);
3764     if (r == rule) {
3765         logln((UnicodeString)"OK: toRules() => " + r);
3766     } else {
3767         errln((UnicodeString)"FAIL: toRules() => " + r +
3768               ", expected " + rule);
3769     }
3770 
3771     expect(*t, "The Quick Brown Fox",
3772            UNICODE_STRING_SIMPLE("T(t=\\u0074)he Q(q=\\u0071)uick B(b=\\u0062)rown F(f=\\u0066)ox"));
3773 
3774     delete t;
3775 }
3776 
TestInvalidBackRef()3777 void TransliteratorTest::TestInvalidBackRef() {
3778     UnicodeString rule =  ". > $1;";
3779     UnicodeString rule2 =CharsToUnicodeString("(.) <> &hex/unicode($1) &name($1); . > $1; [{}] >\\u0020;");
3780     UParseError pe;
3781     UErrorCode ec = U_ZERO_ERROR;
3782     Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3783     Transliterator *t2 = Transliterator::createFromRules("Test2", rule2, UTRANS_FORWARD, pe, ec);
3784 
3785     if (t != nullptr) {
3786         errln("FAIL: createFromRules should have returned nullptr");
3787         delete t;
3788     }
3789 
3790     if (t2 != nullptr) {
3791         errln("FAIL: createFromRules should have returned nullptr");
3792         delete t2;
3793     }
3794 
3795     if (U_SUCCESS(ec)) {
3796         errln("FAIL: Ok: . > $1; => no error");
3797     } else {
3798         logln((UnicodeString)"Ok: . > $1; => " + u_errorName(ec));
3799     }
3800 }
3801 
TestMulticharStringSet()3802 void TransliteratorTest::TestMulticharStringSet() {
3803     // Basic testing
3804     const char* rule =
3805         "       [{aa}]       > x;"
3806         "         a          > y;"
3807         "       [b{bc}]      > z;"
3808         "[{gd}] { e          > q;"
3809         "         e } [{fg}] > r;" ;
3810 
3811     UParseError pe;
3812     UErrorCode ec = U_ZERO_ERROR;
3813     Transliterator* t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3814     if (t == nullptr || U_FAILURE(ec)) {
3815         delete t;
3816         errln("FAIL: createFromRules failed");
3817         return;
3818     }
3819 
3820     expect(*t, "a aa ab bc d gd de gde gdefg ddefg",
3821            "y x yz z d gd de gdq gdqfg ddrfg");
3822     delete t;
3823 
3824     // Overlapped string test.  Make sure that when multiple
3825     // strings can match that the longest one is matched.
3826     rule =
3827         "    [a {ab} {abc}]    > x;"
3828         "           b          > y;"
3829         "           c          > z;"
3830         " q [t {st} {rst}] { e > p;" ;
3831 
3832     t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3833     if (t == nullptr || U_FAILURE(ec)) {
3834         delete t;
3835         errln("FAIL: createFromRules failed");
3836         return;
3837     }
3838 
3839     expect(*t, "a ab abc qte qste qrste",
3840            "x x x qtp qstp qrstp");
3841     delete t;
3842 }
3843 
3844 // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
3845 // BEGIN TestUserFunction support factory
3846 
3847 Transliterator* _TUFF[4];
3848 UnicodeString* _TUFID[4];
3849 
_TUFFactory(const UnicodeString &,Transliterator::Token context)3850 static Transliterator* U_EXPORT2 _TUFFactory(const UnicodeString& /*ID*/,
3851                                    Transliterator::Token context) {
3852     return _TUFF[context.integer]->clone();
3853 }
3854 
_TUFReg(const UnicodeString & ID,Transliterator * t,int32_t n)3855 static void _TUFReg(const UnicodeString& ID, Transliterator* t, int32_t n) {
3856     _TUFF[n] = t;
3857     _TUFID[n] = new UnicodeString(ID);
3858     Transliterator::registerFactory(ID, _TUFFactory, Transliterator::integerToken(n));
3859 }
3860 
_TUFUnreg(int32_t n)3861 static void _TUFUnreg(int32_t n) {
3862     if (_TUFF[n] != nullptr) {
3863         Transliterator::unregister(*_TUFID[n]);
3864         delete _TUFF[n];
3865         delete _TUFID[n];
3866     }
3867 }
3868 
3869 // END TestUserFunction support factory
3870 // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3871 
3872 /**
3873  * Test that user-registered transliterators can be used under function
3874  * syntax.
3875  */
TestUserFunction()3876 void TransliteratorTest::TestUserFunction() {
3877 
3878     Transliterator* t;
3879     UParseError pe;
3880     UErrorCode ec = U_ZERO_ERROR;
3881 
3882     // Setup our factory
3883     int32_t i;
3884     for (i=0; i<4; ++i) {
3885         _TUFF[i] = nullptr;
3886     }
3887 
3888     // There's no need to register inverses if we don't use them
3889     t = Transliterator::createFromRules("gif",
3890                                         UNICODE_STRING_SIMPLE("'\\'u(..)(..) > '<img src=\"http://www.unicode.org/gifs/24/' $1 '/U' $1$2 '.gif\">';"),
3891                                         UTRANS_FORWARD, pe, ec);
3892     if (t == nullptr || U_FAILURE(ec)) {
3893         dataerrln((UnicodeString)"FAIL: createFromRules gif " + u_errorName(ec));
3894         return;
3895     }
3896     _TUFReg("Any-gif", t, 0);
3897 
3898     t = Transliterator::createFromRules("RemoveCurly",
3899                                         UNICODE_STRING_SIMPLE("[\\{\\}] > ; '\\N' > ;"),
3900                                         UTRANS_FORWARD, pe, ec);
3901     if (t == nullptr || U_FAILURE(ec)) {
3902         errln((UnicodeString)"FAIL: createFromRules RemoveCurly " + u_errorName(ec));
3903         goto FAIL;
3904     }
3905     expect(*t, UNICODE_STRING_SIMPLE("\\N{name}"), "name");
3906     _TUFReg("Any-RemoveCurly", t, 1);
3907 
3908     logln("Trying &hex");
3909     t = Transliterator::createFromRules("hex2",
3910                                         "(.) > &hex($1);",
3911                                         UTRANS_FORWARD, pe, ec);
3912     if (t == nullptr || U_FAILURE(ec)) {
3913         errln("FAIL: createFromRules");
3914         goto FAIL;
3915     }
3916     logln("Registering");
3917     _TUFReg("Any-hex2", t, 2);
3918     t = Transliterator::createInstance("Any-hex2", UTRANS_FORWARD, ec);
3919     if (t == nullptr || U_FAILURE(ec)) {
3920         errln((UnicodeString)"FAIL: createInstance Any-hex2 " + u_errorName(ec));
3921         goto FAIL;
3922     }
3923     expect(*t, "abc", UNICODE_STRING_SIMPLE("\\u0061\\u0062\\u0063"));
3924     delete t;
3925 
3926     logln("Trying &gif");
3927     t = Transliterator::createFromRules("gif2",
3928                                         "(.) > &Gif(&Hex2($1));",
3929                                         UTRANS_FORWARD, pe, ec);
3930     if (t == nullptr || U_FAILURE(ec)) {
3931         errln((UnicodeString)"FAIL: createFromRules gif2 " + u_errorName(ec));
3932         goto FAIL;
3933     }
3934     logln("Registering");
3935     _TUFReg("Any-gif2", t, 3);
3936     t = Transliterator::createInstance("Any-gif2", UTRANS_FORWARD, ec);
3937     if (t == nullptr || U_FAILURE(ec)) {
3938         errln((UnicodeString)"FAIL: createInstance Any-gif2 " + u_errorName(ec));
3939         goto FAIL;
3940     }
3941     expect(*t, "ab", "<img src=\"http://www.unicode.org/gifs/24/00/U0061.gif\">"
3942            "<img src=\"http://www.unicode.org/gifs/24/00/U0062.gif\">");
3943     delete t;
3944 
3945     // Test that filters are allowed after &
3946     t = Transliterator::createFromRules("test",
3947                                         "(.) > &Hex($1) ' ' &RemoveCurly(&Name($1)) ' ';",
3948                                         UTRANS_FORWARD, pe, ec);
3949     if (t == nullptr || U_FAILURE(ec)) {
3950         errln((UnicodeString)"FAIL: createFromRules test " + u_errorName(ec));
3951         goto FAIL;
3952     }
3953     expect(*t, "abc",
3954            UNICODE_STRING_SIMPLE("\\u0061 LATIN SMALL LETTER A \\u0062 LATIN SMALL LETTER B \\u0063 LATIN SMALL LETTER C "));
3955     delete t;
3956 
3957  FAIL:
3958     for (i=0; i<4; ++i) {
3959         _TUFUnreg(i);
3960     }
3961 }
3962 
3963 /**
3964  * Test the Any-X transliterators.
3965  */
TestAnyX()3966 void TransliteratorTest::TestAnyX() {
3967     UParseError parseError;
3968     UErrorCode status = U_ZERO_ERROR;
3969     Transliterator* anyLatin =
3970         Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
3971     if (anyLatin==0) {
3972         dataerrln("FAIL: createInstance returned nullptr - %s", u_errorName(status));
3973         delete anyLatin;
3974         return;
3975     }
3976 
3977     expect(*anyLatin,
3978            CharsToUnicodeString("greek:\\u03B1\\u03B2\\u03BA\\u0391\\u0392\\u039A hiragana:\\u3042\\u3076\\u304F cyrillic:\\u0430\\u0431\\u0446"),
3979            CharsToUnicodeString("greek:abkABK hiragana:abuku cyrillic:abc"));
3980 
3981     delete anyLatin;
3982 }
3983 
3984 /**
3985  * Test Any-X transliterators with sample letters from all scripts.
3986  */
TestAny()3987 void TransliteratorTest::TestAny() {
3988     UErrorCode status = U_ZERO_ERROR;
3989     // Note: there is a lot of implicit construction of UnicodeStrings from (char *) in
3990     //       function call parameters going on in this test.
3991     UnicodeSet alphabetic("[:alphabetic:]", status);
3992     if (U_FAILURE(status)) {
3993         dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
3994         return;
3995     }
3996     alphabetic.freeze();
3997 
3998     UnicodeString testString;
3999     for (int32_t i = 0; i < USCRIPT_CODE_LIMIT; i++) {
4000         const char *scriptName = uscript_getShortName((UScriptCode)i);
4001         if (scriptName == nullptr) {
4002             errln("Failure: file %s, line %d: Script Code %d is invalid, ", __FILE__, __LINE__, i);
4003             return;
4004         }
4005 
4006         UnicodeSet sample;
4007         sample.applyPropertyAlias("script", scriptName, status);
4008         if (U_FAILURE(status)) {
4009             errln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4010             return;
4011         }
4012         sample.retainAll(alphabetic);
4013         for (int32_t count=0; count<5; count++) {
4014             UChar32 c = sample.charAt(count);
4015             if (c == -1) {
4016                 break;
4017             }
4018             testString.append(c);
4019         }
4020     }
4021 
4022     UParseError parseError;
4023     Transliterator* anyLatin =
4024         Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4025     if (U_FAILURE(status)) {
4026         dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4027         return;
4028     }
4029 
4030     logln(UnicodeString("Sample set for Any-Latin: ") + testString);
4031     anyLatin->transliterate(testString);
4032     logln(UnicodeString("Sample result for Any-Latin: ") + testString);
4033     delete anyLatin;
4034 }
4035 
4036 
4037 /**
4038  * Test the source and target set API.  These are only implemented
4039  * for RBT and CompoundTransliterator at this time.
4040  */
TestSourceTargetSet()4041 void TransliteratorTest::TestSourceTargetSet() {
4042     UErrorCode ec = U_ZERO_ERROR;
4043 
4044     // Rules
4045     const char* r =
4046         "a > b; "
4047         "r [x{lu}] > q;";
4048 
4049     // Expected source
4050     UnicodeSet expSrc("[arx{lu}]", ec);
4051 
4052     // Expected target
4053     UnicodeSet expTrg("[bq]", ec);
4054 
4055     UParseError pe;
4056     Transliterator* t = Transliterator::createFromRules("test", r, UTRANS_FORWARD, pe, ec);
4057 
4058     if (U_FAILURE(ec)) {
4059         delete t;
4060         errln("FAIL: Couldn't set up test");
4061         return;
4062     }
4063 
4064     UnicodeSet src; t->getSourceSet(src);
4065     UnicodeSet trg; t->getTargetSet(trg);
4066 
4067     if (src == expSrc && trg == expTrg) {
4068         UnicodeString a, b;
4069         logln((UnicodeString)"Ok: " +
4070               r + " => source = " + src.toPattern(a, true) +
4071               ", target = " + trg.toPattern(b, true));
4072     } else {
4073         UnicodeString a, b, c, d;
4074         errln((UnicodeString)"FAIL: " +
4075               r + " => source = " + src.toPattern(a, true) +
4076               ", expected " + expSrc.toPattern(b, true) +
4077               "; target = " + trg.toPattern(c, true) +
4078               ", expected " + expTrg.toPattern(d, true));
4079     }
4080 
4081     delete t;
4082 }
4083 
4084 /**
4085  * Test handling of Pattern_White_Space, for both RBT and UnicodeSet.
4086  */
TestPatternWhiteSpace()4087 void TransliteratorTest::TestPatternWhiteSpace() {
4088     // Rules
4089     const char* r = "a > \\u200E b;";
4090 
4091     UErrorCode ec = U_ZERO_ERROR;
4092     UParseError pe;
4093     Transliterator* t = Transliterator::createFromRules("test", CharsToUnicodeString(r), UTRANS_FORWARD, pe, ec);
4094 
4095     if (U_FAILURE(ec)) {
4096         errln("FAIL: Couldn't set up test");
4097     } else {
4098         expect(*t, "a", "b");
4099     }
4100     delete t;
4101 
4102     // UnicodeSet
4103     ec = U_ZERO_ERROR;
4104     UnicodeSet set(CharsToUnicodeString("[a \\u200E]"), ec);
4105 
4106     if (U_FAILURE(ec)) {
4107         errln("FAIL: Couldn't set up test");
4108     } else {
4109         if (set.contains(0x200E)) {
4110             errln("FAIL: U+200E not being ignored by UnicodeSet");
4111         }
4112     }
4113 }
4114 //======================================================================
4115 // this method is in TestUScript.java
4116 //======================================================================
TestAllCodepoints()4117 void TransliteratorTest::TestAllCodepoints(){
4118     UScriptCode code= USCRIPT_INVALID_CODE;
4119     char id[256]={'\0'};
4120     char abbr[256]={'\0'};
4121     char newId[256]={'\0'};
4122     char newAbbrId[256]={'\0'};
4123     char oldId[256]={'\0'};
4124     char oldAbbrId[256]={'\0'};
4125 
4126     UErrorCode status =U_ZERO_ERROR;
4127     UParseError pe;
4128 
4129     for(uint32_t i = 0; i<=0x10ffff; i++){
4130         code =  uscript_getScript(i,&status);
4131         if(code == USCRIPT_INVALID_CODE){
4132             dataerrln("uscript_getScript for codepoint \\U%08X failed.", i);
4133         }
4134         const char* myId = uscript_getName(code);
4135         if(!myId) {
4136           dataerrln("Valid script code returned nullptr name. Check your data!");
4137           return;
4138         }
4139         uprv_strcpy(id,myId);
4140         uprv_strcpy(abbr,uscript_getShortName(code));
4141 
4142         uprv_strcpy(newId,"[:");
4143         uprv_strcat(newId,id);
4144         uprv_strcat(newId,":];NFD");
4145 
4146         uprv_strcpy(newAbbrId,"[:");
4147         uprv_strcat(newAbbrId,abbr);
4148         uprv_strcat(newAbbrId,":];NFD");
4149 
4150         if(uprv_strcmp(newId,oldId)!=0){
4151             Transliterator* t = Transliterator::createInstance(newId,UTRANS_FORWARD,pe,status);
4152             if(t==nullptr || U_FAILURE(status)){
4153                 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4154             }
4155             delete t;
4156         }
4157         if(uprv_strcmp(newAbbrId,oldAbbrId)!=0){
4158             Transliterator* t = Transliterator::createInstance(newAbbrId,UTRANS_FORWARD,pe,status);
4159             if(t==nullptr || U_FAILURE(status)){
4160                 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4161             }
4162             delete t;
4163         }
4164         uprv_strcpy(oldId,newId);
4165         uprv_strcpy(oldAbbrId, newAbbrId);
4166 
4167     }
4168 
4169 }
4170 
// Instantiates the transliterator registered under `id` and checks that its
// dynamic class ID equals the static class ID of `cls`.  A creation failure
// is reported through dataerrln().  `id` is also used as a "%s" argument,
// so it must be a C string.
#define TEST_TRANSLIT_ID(id, cls) UPRV_BLOCK_MACRO_BEGIN { \
  UErrorCode ec = U_ZERO_ERROR; \
  Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); \
  if (U_SUCCESS(ec)) { \
    /* *t = *t; */ /*can't do this: coverage test for assignment op*/ \
    if (t->getDynamicClassID() != cls::getStaticClassID()) { \
      errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
    } \
  } else { \
    dataerrln("FAIL: Couldn't create %s - %s", id, u_errorName(ec)); \
  } \
  delete t; \
} UPRV_BLOCK_MACRO_END
4184 
// Builds a transliterator from `rule` and checks that its dynamic class ID
// equals the static class ID of `cls`.  `rule` must be a string literal: it
// is pasted next to another literal to form the failure message.
#define TEST_TRANSLIT_RULE(rule, cls) UPRV_BLOCK_MACRO_BEGIN { \
  UErrorCode ec = U_ZERO_ERROR; \
  UParseError pe; \
  Transliterator* t = Transliterator::createFromRules("_", rule, UTRANS_FORWARD, pe, ec); \
  if (U_SUCCESS(ec)) { \
    /* *t = *t; */ /*can't do this: coverage test for assignment op*/ \
    if (t->getDynamicClassID() != cls ::getStaticClassID()) { \
      errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
    } \
  } else { \
    errln("FAIL: Couldn't create " rule); \
  } \
  delete t; \
} UPRV_BLOCK_MACRO_END
4199 
TestBoilerplate()4200 void TransliteratorTest::TestBoilerplate() {
4201     TEST_TRANSLIT_ID("Any-Latin", AnyTransliterator);
4202     TEST_TRANSLIT_ID("Any-Hex", EscapeTransliterator);
4203     TEST_TRANSLIT_ID("Hex-Any", UnescapeTransliterator);
4204     TEST_TRANSLIT_ID("Lower", LowercaseTransliterator);
4205     TEST_TRANSLIT_ID("Upper", UppercaseTransliterator);
4206     TEST_TRANSLIT_ID("Title", TitlecaseTransliterator);
4207     TEST_TRANSLIT_ID("Null", NullTransliterator);
4208     TEST_TRANSLIT_ID("Remove", RemoveTransliterator);
4209     TEST_TRANSLIT_ID("Any-Name", UnicodeNameTransliterator);
4210     TEST_TRANSLIT_ID("Name-Any", NameUnicodeTransliterator);
4211     TEST_TRANSLIT_ID("NFD", NormalizationTransliterator);
4212     TEST_TRANSLIT_ID("Latin-Greek", CompoundTransliterator);
4213     TEST_TRANSLIT_RULE("a>b;", RuleBasedTransliterator);
4214 }
4215 
TestAlternateSyntax()4216 void TransliteratorTest::TestAlternateSyntax() {
4217     // U+2206 == &
4218     // U+2190 == <
4219     // U+2192 == >
4220     // U+2194 == <>
4221     expect(CharsToUnicodeString("a \\u2192 x; b \\u2190 y; c \\u2194 z"),
4222            "abc",
4223            "xbz");
4224     expect(CharsToUnicodeString("([:^ASCII:]) \\u2192 \\u2206Name($1);"),
4225            CharsToUnicodeString("<=\\u2190; >=\\u2192; <>=\\u2194; &=\\u2206"),
4226            UNICODE_STRING_SIMPLE("<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}"));
4227 }
4228 
// Rule sets shared by TestBeginEnd() and TestBeginEndToRules().
// Entries that rely on ::BEGIN/::END are disabled: their source is kept in
// line comments and an empty string holds their slot, so that the indexes
// used by BEGIN_END_TEST_CASES do not shift.
static const char* BEGIN_END_RULES[] = {
    // [0]
    "abc > xy;"
    "aba > z;",

    // [1] disabled
    //    "::BEGIN;"
    //    "abc > xy;"
    //    "::END;"
    //    "::BEGIN;"
    //    "aba > z;"
    //    "::END;",
    "", // placeholder, keeps the indexes stable

    // [2] disabled
    //    "abc > xy;"
    //    "::BEGIN;"
    //    "aba > z;"
    //    "::END;",
    "", // placeholder, keeps the indexes stable

    // [3] disabled
    //    "::BEGIN;"
    //    "abc > xy;"
    //    "::END;"
    //    "aba > z;",
    "", // placeholder, keeps the indexes stable

    // [4]
    "abc > xy;"
    "::Null;"
    "aba > z;",

    // [5]
    "::Upper;"
    "ABC > xy;"
    "AB > x;"
    "C > z;"
    "::Upper;"
    "XYZ > p;"
    "XY > q;"
    "Z > r;"
    "::Upper;",

    // [6]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [7]
    "::Null;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [8]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;",

    // [9]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [10] disabled
    //    "::BEGIN;"
    //    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    //    "$delim = [\\-$ws];"
    //    "::END;"
    //    "$ws $delim* > ' ';"
    //    "'-' $delim* > '-';",
    "", // placeholder, keeps the indexes stable

    // [11] disabled
    //    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    //    "$delim = [\\-$ws];"
    //    "::BEGIN;"
    //    "$ws $delim* > ' ';"
    //    "'-' $delim* > '-';"
    //    "::END;",
    "", // placeholder, keeps the indexes stable

    // [12] disabled
    //    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    //    "$delim = [\\-$ws];"
    //    "$ab = [ab];"
    //    "::BEGIN;"
    //    "$ws $delim* > ' ';"
    //    "'-' $delim* > '-';"
    //    "::END;"
    //    "::BEGIN;"
    //    "$ab { ' ' } $ab > '-';"
    //    "c { ' ' > ;"
    //    "::END;"
    //    "::BEGIN;"
    //    "'a-a' > a\\%|a;"
    //    "::END;",
    "", // placeholder, keeps the indexes stable

    // [13]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::Null;"
    "'a-a' > a\\%|a;",

    // [14] disabled
    //    "::[abc];"
    //    "::BEGIN;"
    //    "abc > xy;"
    //    "::END;"
    //    "::BEGIN;"
    //    "aba > yz;"
    //    "::END;"
    //    "::Upper;",
    "", // placeholder, keeps the indexes stable

    // [15]
    "::[abc];"
    "abc > xy;"
    "::Null;"
    "aba > yz;"
    "::Upper;",

    // [16] disabled
    //    "::[abc];"
    //    "::BEGIN;"
    //    "abc <> xy;"
    //    "::END;"
    //    "::BEGIN;"
    //    "aba <> yz;"
    //    "::END;"
    //    "::Upper(Lower);"
    //    "::([XYZ]);"
    "", // placeholder, keeps the indexes stable

    // [17] the one reversible rule set (uses <>)
    "::[abc];"
    "abc <> xy;"
    "::Null;"
    "aba <> yz;"
    "::Upper(Lower);"
    "::([XYZ]);"
};
4402 
4403 /*
4404 (This entire test is commented out below and will need some heavy revision when we re-add
4405 the ::BEGIN/::END stuff)
4406 static const char* BOGUS_BEGIN_END_RULES[] = {
4407     // [7]
4408     "::BEGIN;"
4409     "abc > xy;"
4410     "::BEGIN;"
4411     "aba > z;"
4412     "::END;"
4413     "::END;",
4414 
4415     // [8]
4416     "abc > xy;"
4417     " aba > z;"
4418     "::END;",
4419 
4420     // [9]
4421     "::BEGIN;"
4422     "::Upper;"
4423     "::END;"
4424 };
4425 static const int32_t BOGUS_BEGIN_END_RULES_length = UPRV_LENGTHOF(BOGUS_BEGIN_END_RULES);
4426 */
4427 
4428 static const char* BEGIN_END_TEST_CASES[] = {
4429     // rules             input                   expected output
4430     BEGIN_END_RULES[0],  "abc ababc aba",        "xy zbc z",
4431 //    BEGIN_END_RULES[1],  "abc ababc aba",        "xy abxy z",
4432 //    BEGIN_END_RULES[2],  "abc ababc aba",        "xy abxy z",
4433 //    BEGIN_END_RULES[3],  "abc ababc aba",        "xy abxy z",
4434     BEGIN_END_RULES[4],  "abc ababc aba",        "xy abxy z",
4435     BEGIN_END_RULES[5],  "abccabaacababcbc",     "PXAARXQBR",
4436 
4437     BEGIN_END_RULES[6],  "e   e - e---e-  e",    "e e e-e-e",
4438     BEGIN_END_RULES[7],  "e   e - e---e-  e",    "e e e-e-e",
4439     BEGIN_END_RULES[8],  "e   e - e---e-  e",    "e e e-e-e",
4440     BEGIN_END_RULES[9],  "e   e - e---e-  e",    "e e e-e-e",
4441 //    BEGIN_END_RULES[10],  "e   e - e---e-  e",    "e e e-e-e",
4442 //    BEGIN_END_RULES[11], "e   e - e---e-  e",    "e e e-e-e",
4443 //    BEGIN_END_RULES[12], "e   e - e---e-  e",    "e e e-e-e",
4444 //    BEGIN_END_RULES[12], "a    a    a    a",     "a%a%a%a",
4445 //    BEGIN_END_RULES[12], "a a-b c b a",          "a%a-b cb-a",
4446     BEGIN_END_RULES[13], "e   e - e---e-  e",    "e e e-e-e",
4447     BEGIN_END_RULES[13], "a    a    a    a",     "a%a%a%a",
4448     BEGIN_END_RULES[13], "a a-b c b a",          "a%a-b cb-a",
4449 
4450 //    BEGIN_END_RULES[14], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
4451     BEGIN_END_RULES[15], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
4452 //    BEGIN_END_RULES[16], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
4453     BEGIN_END_RULES[17], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ"
4454 };
4455 static const int32_t BEGIN_END_TEST_CASES_length = UPRV_LENGTHOF(BEGIN_END_TEST_CASES);
4456 
TestBeginEnd()4457 void TransliteratorTest::TestBeginEnd() {
4458     // run through the list of test cases above
4459     int32_t i = 0;
4460     for (i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4461         expect((UnicodeString)"Test case #" + (i / 3),
4462                UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4463                UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4464                UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4465     }
4466 
4467     // instantiate the one reversible rule set in the reverse direction and make sure it does the right thing
4468     UParseError parseError;
4469     UErrorCode status = U_ZERO_ERROR;
4470     Transliterator* reversed  = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4471             UTRANS_REVERSE, parseError, status);
4472     if (reversed == 0 || U_FAILURE(status)) {
4473         reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4474     } else {
4475         expect(*reversed, UnicodeString("xy XY XYZ yz YZ"), UnicodeString("xy abc xaba yz aba"));
4476     }
4477     delete reversed;
4478 
4479     // finally, run through the list of syntactically-ill-formed rule sets above and make sure
4480     // that all of them cause errors
4481 /*
4482 (commented out until we have the real ::BEGIN/::END stuff in place
4483     for (i = 0; i < BOGUS_BEGIN_END_RULES_length; i++) {
4484         UParseError parseError;
4485         UErrorCode status = U_ZERO_ERROR;
4486         Transliterator* t = Transliterator::createFromRules("foo", UnicodeString(BOGUS_BEGIN_END_RULES[i]),
4487                 UTRANS_FORWARD, parseError, status);
4488         if (!U_FAILURE(status)) {
4489             delete t;
4490             errln((UnicodeString)"Should have gotten syntax error from " + BOGUS_BEGIN_END_RULES[i]);
4491         }
4492     }
4493 */
4494 }
4495 
TestBeginEndToRules()4496 void TransliteratorTest::TestBeginEndToRules() {
4497     // run through the same list of test cases we used above, but this time, instead of just
4498     // instantiating a Transliterator from the rules and running the test against it, we instantiate
4499     // a Transliterator from the rules, do toRules() on it, instantiate a Transliterator from
4500     // the resulting set of rules, and make sure that the generated rule set is semantically equivalent
4501     // to (i.e., does the same thing as) the original rule set
4502     for (int32_t i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4503         UParseError parseError;
4504         UErrorCode status = U_ZERO_ERROR;
4505         Transliterator* t = Transliterator::createFromRules("--", UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4506                 UTRANS_FORWARD, parseError, status);
4507         if (U_FAILURE(status)) {
4508             reportParseError(UnicodeString("FAIL: Couldn't create transliterator"), parseError, status);
4509         } else {
4510             UnicodeString rules;
4511             t->toRules(rules, true);
4512             Transliterator* t2 = Transliterator::createFromRules((UnicodeString)"Test case #" + (i / 3), rules,
4513                     UTRANS_FORWARD, parseError, status);
4514             if (U_FAILURE(status)) {
4515                 reportParseError(UnicodeString("FAIL: Couldn't create transliterator from generated rules"),
4516                         parseError, status);
4517                 delete t;
4518             } else {
4519                 expect(*t2,
4520                        UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4521                        UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4522                 delete t;
4523                 delete t2;
4524             }
4525         }
4526     }
4527 
4528     // do the same thing for the reversible test case
4529     UParseError parseError;
4530     UErrorCode status = U_ZERO_ERROR;
4531     Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4532             UTRANS_REVERSE, parseError, status);
4533     if (U_FAILURE(status)) {
4534         reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4535     } else {
4536         UnicodeString rules;
4537         reversed->toRules(rules, false);
4538         Transliterator* reversed2 = Transliterator::createFromRules("Reversed", rules, UTRANS_FORWARD,
4539                 parseError, status);
4540         if (U_FAILURE(status)) {
4541             reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator from generated rules"),
4542                     parseError, status);
4543             delete reversed;
4544         } else {
4545             expect(*reversed2,
4546                    UnicodeString("xy XY XYZ yz YZ"),
4547                    UnicodeString("xy abc xaba yz aba"));
4548             delete reversed;
4549             delete reversed2;
4550         }
4551     }
4552 }
4553 
TestRegisterAlias()4554 void TransliteratorTest::TestRegisterAlias() {
4555     UnicodeString longID("Lower;[aeiou]Upper");
4556     UnicodeString shortID("Any-CapVowels");
4557     UnicodeString reallyShortID("CapVowels");
4558 
4559     Transliterator::registerAlias(shortID, longID);
4560 
4561     UErrorCode err = U_ZERO_ERROR;
4562     Transliterator* t1 = Transliterator::createInstance(longID, UTRANS_FORWARD, err);
4563     if (U_FAILURE(err)) {
4564         errln("Failed to instantiate transliterator with long ID");
4565         Transliterator::unregister(shortID);
4566         return;
4567     }
4568     Transliterator* t2 = Transliterator::createInstance(reallyShortID, UTRANS_FORWARD, err);
4569     if (U_FAILURE(err)) {
4570         errln("Failed to instantiate transliterator with short ID");
4571         delete t1;
4572         Transliterator::unregister(shortID);
4573         return;
4574     }
4575 
4576     if (t1->getID() != longID)
4577         errln("Transliterator instantiated with long ID doesn't have long ID");
4578     if (t2->getID() != reallyShortID)
4579         errln("Transliterator instantiated with short ID doesn't have short ID");
4580 
4581     UnicodeString rules1;
4582     UnicodeString rules2;
4583 
4584     t1->toRules(rules1, true);
4585     t2->toRules(rules2, true);
4586     if (rules1 != rules2)
4587         errln("Alias transliterators aren't the same");
4588 
4589     delete t1;
4590     delete t2;
4591     Transliterator::unregister(shortID);
4592 
4593     t1 = Transliterator::createInstance(shortID, UTRANS_FORWARD, err);
4594     if (U_SUCCESS(err)) {
4595         errln("Instantiation with short ID succeeded after short ID was unregistered");
4596         delete t1;
4597     }
4598 
4599     // try the same thing again, but this time with something other than
4600     // an instance of CompoundTransliterator
4601     UnicodeString realID("Latin-Greek");
4602     UnicodeString fakeID("Latin-dlgkjdflkjdl");
4603     Transliterator::registerAlias(fakeID, realID);
4604 
4605     err = U_ZERO_ERROR;
4606     t1 = Transliterator::createInstance(realID, UTRANS_FORWARD, err);
4607     if (U_FAILURE(err)) {
4608         dataerrln("Failed to instantiate transliterator with real ID - %s", u_errorName(err));
4609         Transliterator::unregister(realID);
4610         return;
4611     }
4612     t2 = Transliterator::createInstance(fakeID, UTRANS_FORWARD, err);
4613     if (U_FAILURE(err)) {
4614         errln("Failed to instantiate transliterator with fake ID");
4615         delete t1;
4616         Transliterator::unregister(realID);
4617         return;
4618     }
4619 
4620     t1->toRules(rules1, true);
4621     t2->toRules(rules2, true);
4622     if (rules1 != rules2)
4623         errln("Alias transliterators aren't the same");
4624 
4625     delete t1;
4626     delete t2;
4627     Transliterator::unregister(fakeID);
4628 }
4629 
TestRuleStripping()4630 void TransliteratorTest::TestRuleStripping() {
4631     /*
4632 #
4633 \uE001>\u0C01; # SIGN
4634     */
4635     static const char16_t rule[] = {
4636         0x0023,0x0020,0x000D,0x000A,
4637         0xE001,0x003E,0x0C01,0x003B,0x0020,0x0023,0x0020,0x0053,0x0049,0x0047,0x004E,0
4638     };
4639     static const char16_t expectedRule[] = {
4640         0xE001,0x003E,0x0C01,0x003B,0
4641     };
4642     char16_t result[UPRV_LENGTHOF(rule)];
4643     UErrorCode status = U_ZERO_ERROR;
4644     int32_t len = utrans_stripRules(rule, UPRV_LENGTHOF(rule), result, &status);
4645     if (len != u_strlen(expectedRule)) {
4646         errln("utrans_stripRules return len = %d", len);
4647     }
4648     if (u_strncmp(expectedRule, result, len) != 0) {
4649         errln("utrans_stripRules did not return expected string");
4650     }
4651 }
4652 
4653 /**
4654  * Test the Halfwidth-Fullwidth transliterator (ticket 6281).
4655  */
TestHalfwidthFullwidth()4656 void TransliteratorTest::TestHalfwidthFullwidth() {
4657     UParseError parseError;
4658     UErrorCode status = U_ZERO_ERROR;
4659     Transliterator* hf = Transliterator::createInstance("Halfwidth-Fullwidth", UTRANS_FORWARD, parseError, status);
4660     Transliterator* fh = Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, parseError, status);
4661     if (hf == 0 || fh == 0) {
4662         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4663         delete hf;
4664         delete fh;
4665         return;
4666     }
4667 
4668     // Array of 2n items
4669     // Each item is
4670     //   "hf"|"fh"|"both",
4671     //   <Halfwidth>,
4672     //   <Fullwidth>
4673     const char* DATA[] = {
4674         "both",
4675         "\\uFFE9\\uFFEA\\uFFEB\\uFFEC\\u0061\\uFF71\\u00AF\\u0020",
4676         "\\u2190\\u2191\\u2192\\u2193\\uFF41\\u30A2\\uFFE3\\u3000",
4677     };
4678     int32_t DATA_length = UPRV_LENGTHOF(DATA);
4679 
4680     for (int32_t i=0; i<DATA_length; i+=3) {
4681         UnicodeString h = CharsToUnicodeString(DATA[i+1]);
4682         UnicodeString f = CharsToUnicodeString(DATA[i+2]);
4683         switch (*DATA[i]) {
4684         case 0x68: //'h': // Halfwidth-Fullwidth only
4685             expect(*hf, h, f);
4686             break;
4687         case 0x66: //'f': // Fullwidth-Halfwidth only
4688             expect(*fh, f, h);
4689             break;
4690         case 0x62: //'b': // both directions
4691             expect(*hf, h, f);
4692             expect(*fh, f, h);
4693             break;
4694         }
4695     }
4696     delete hf;
4697     delete fh;
4698 }
4699 
4700 
4701     /**
4702      *  Test Thai.  The text is the first paragraph of "What is Unicode" from the Unicode.org web site.
4703      *              TODO: confirm that the expected results are correct.
4704      *              For now, test just confirms that C++ and Java give identical results.
4705      */
TestThai()4706 void TransliteratorTest::TestThai() {
4707 #if !UCONFIG_NO_BREAK_ITERATION
4708     // The expectations in this test heavily depends on the Thai dictionary.
4709     // Therefore, we skip this test under the LSTM configuration.
4710     if (skipDictionaryTest()) {
4711         return;
4712     }
4713     UParseError parseError;
4714     UErrorCode status = U_ZERO_ERROR;
4715     Transliterator* tr = Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4716     if (tr == 0) {
4717         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4718         return;
4719     }
4720     if (U_FAILURE(status)) {
4721         errln("FAIL: createInstance failed with %s", u_errorName(status));
4722         return;
4723     }
4724     const char *thaiText =
4725         "\\u0e42\\u0e14\\u0e22\\u0e1e\\u0e37\\u0e49\\u0e19\\u0e10\\u0e32\\u0e19\\u0e41\\u0e25\\u0e49\\u0e27, \\u0e04\\u0e2d"
4726         "\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d\\u0e23\\u0e4c\\u0e08\\u0e30\\u0e40\\u0e01\\u0e35\\u0e48\\u0e22"
4727         "\\u0e27\\u0e02\\u0e49\\u0e2d\\u0e07\\u0e01\\u0e31\\u0e1a\\u0e40\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e02\\u0e2d"
4728         "\\u0e07\\u0e15\\u0e31\\u0e27\\u0e40\\u0e25\\u0e02. \\u0e04\\u0e2d\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d"
4729         "\\u0e23\\u0e4c\\u0e08\\u0e31\\u0e14\\u0e40\\u0e01\\u0e47\\u0e1a\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29"
4730         "\\u0e23\\u0e41\\u0e25\\u0e30\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30\\u0e2d\\u0e37\\u0e48\\u0e19\\u0e46 \\u0e42"
4731         "\\u0e14\\u0e22\\u0e01\\u0e32\\u0e23\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25"
4732         "\\u0e02\\u0e43\\u0e2b\\u0e49\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e41\\u0e15\\u0e48\\u0e25\\u0e30\\u0e15"
4733         "\\u0e31\\u0e27. \\u0e01\\u0e48\\u0e2d\\u0e19\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48\\u0e4a Unicode \\u0e08"
4734         "\\u0e30\\u0e16\\u0e39\\u0e01\\u0e2a\\u0e23\\u0e49\\u0e32\\u0e07\\u0e02\\u0e36\\u0e49\\u0e19, \\u0e44\\u0e14\\u0e49"
4735         "\\u0e21\\u0e35\\u0e23\\u0e30\\u0e1a\\u0e1a encoding \\u0e2d\\u0e22\\u0e39\\u0e48\\u0e2b\\u0e25\\u0e32\\u0e22\\u0e23"
4736         "\\u0e49\\u0e2d\\u0e22\\u0e23\\u0e30\\u0e1a\\u0e1a\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e01\\u0e32\\u0e23"
4737         "\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25\\u0e02\\u0e40\\u0e2b\\u0e25\\u0e48"
4738         "\\u0e32\\u0e19\\u0e35\\u0e49. \\u0e44\\u0e21\\u0e48\\u0e21\\u0e35 encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48"
4739         "\\u0e21\\u0e35\\u0e08\\u0e33\\u0e19\\u0e27\\u0e19\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30"
4740         "\\u0e21\\u0e32\\u0e01\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d: \\u0e22\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d"
4741         "\\u0e22\\u0e48\\u0e32\\u0e07\\u0e40\\u0e0a\\u0e48\\u0e19, \\u0e40\\u0e09\\u0e1e\\u0e32\\u0e30\\u0e43\\u0e19\\u0e01"
4742         "\\u0e25\\u0e38\\u0e48\\u0e21\\u0e2a\\u0e2b\\u0e20\\u0e32\\u0e1e\\u0e22\\u0e38\\u0e42\\u0e23\\u0e1b\\u0e40\\u0e1e"
4743         "\\u0e35\\u0e22\\u0e07\\u0e41\\u0e2b\\u0e48\\u0e07\\u0e40\\u0e14\\u0e35\\u0e22\\u0e27 \\u0e01\\u0e47\\u0e15\\u0e49"
4744         "\\u0e2d\\u0e07\\u0e01\\u0e32\\u0e23\\u0e2b\\u0e25\\u0e32\\u0e22 encoding \\u0e43\\u0e19\\u0e01\\u0e32\\u0e23\\u0e04"
4745         "\\u0e23\\u0e2d\\u0e1a\\u0e04\\u0e25\\u0e38\\u0e21\\u0e17\\u0e38\\u0e01\\u0e20\\u0e32\\u0e29\\u0e32\\u0e43\\u0e19"
4746         "\\u0e01\\u0e25\\u0e38\\u0e48\\u0e21. \\u0e2b\\u0e23\\u0e37\\u0e2d\\u0e41\\u0e21\\u0e49\\u0e41\\u0e15\\u0e48\\u0e43"
4747         "\\u0e19\\u0e20\\u0e32\\u0e29\\u0e32\\u0e40\\u0e14\\u0e35\\u0e48\\u0e22\\u0e27 \\u0e40\\u0e0a\\u0e48\\u0e19 \\u0e20"
4748         "\\u0e32\\u0e29\\u0e32\\u0e2d\\u0e31\\u0e07\\u0e01\\u0e24\\u0e29 \\u0e01\\u0e47\\u0e44\\u0e21\\u0e48\\u0e21\\u0e35"
4749         " encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d\\u0e2a\\u0e33\\u0e2b"
4750         "\\u0e23\\u0e31\\u0e1a\\u0e17\\u0e38\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29\\u0e23, \\u0e40\\u0e04"
4751         "\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e27\\u0e23\\u0e23\\u0e04\\u0e15\\u0e2d\\u0e19"
4752         " \\u0e41\\u0e25\\u0e30\\u0e2a\\u0e31\\u0e0d\\u0e25\\u0e31\\u0e01\\u0e29\\u0e13\\u0e4c\\u0e17\\u0e32\\u0e07\\u0e40"
4753         "\\u0e17\\u0e04\\u0e19\\u0e34\\u0e04\\u0e17\\u0e35\\u0e48\\u0e43\\u0e0a\\u0e49\\u0e01\\u0e31\\u0e19\\u0e2d\\u0e22"
4754         "\\u0e39\\u0e48\\u0e17\\u0e31\\u0e48\\u0e27\\u0e44\\u0e1b.";
4755 
4756     const char *latinText =
4757         "doy ph\\u1ee5\\u0304\\u0302n \\u1e6d\\u0304h\\u0101n l\\u00e6\\u0302w, khxmphiwtexr\\u0312 ca ke\\u012b\\u0300"
4758         "ywk\\u0304\\u0125xng k\\u1ea1b re\\u1ee5\\u0304\\u0300xng k\\u0304hxng t\\u1ea1wlek\\u0304h. khxmphiwtexr"
4759         "\\u0312 c\\u1ea1d k\\u0115b t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r l\\u00e6a x\\u1ea1kk\\u0304h ra x\\u1ee5\\u0304"
4760         "\\u0300n\\u00ab doy k\\u0101r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304\\u0131\\u0302 s\\u0304"
4761         "\\u1ea3h\\u0304r\\u1ea1b t\\u00e6\\u0300la t\\u1ea1w. k\\u0300xn h\\u0304n\\u0302\\u0101 th\\u012b\\u0300\\u0301"
4762         " Unicode ca t\\u0304h\\u016bk s\\u0304r\\u0302\\u0101ng k\\u0304h\\u1ee5\\u0302n, d\\u1ecb\\u0302 m\\u012b "
4763         "rabb encoding xy\\u016b\\u0300 h\\u0304l\\u0101y r\\u0302xy rabb s\\u0304\\u1ea3h\\u0304r\\u1ea1b k\\u0101"
4764         "r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304el\\u0300\\u0101 n\\u012b\\u0302. m\\u1ecb\\u0300m"
4765         "\\u012b encoding d\\u0131 th\\u012b\\u0300 m\\u012b c\\u1ea3nwn t\\u1ea1w x\\u1ea1kk\\u0304hra m\\u0101k p"
4766         "he\\u012byng phx: yk t\\u1ea1wx\\u1ef3\\u0101ng ch\\u00e8n, c\\u0304heph\\u0101a n\\u0131 kl\\u00f9m s\\u0304"
4767         "h\\u0304p\\u0323h\\u0101ph yurop phe\\u012byng h\\u0304\\u00e6\\u0300ng de\\u012byw k\\u0306 t\\u0302xngk\\u0101"
4768         "r h\\u0304l\\u0101y encoding n\\u0131 k\\u0101r khrxbkhlum thuk p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 n\\u0131"
4769         " kl\\u00f9m. h\\u0304r\\u1ee5\\u0304x m\\u00e6\\u0302t\\u00e6\\u0300 n\\u0131 p\\u0323h\\u0101s\\u0304\\u02b9"
4770         "\\u0101 de\\u012b\\u0300yw ch\\u00e8n p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 x\\u1ea1ngkvs\\u0304\\u02b9 k\\u0306"
4771         " m\\u1ecb\\u0300m\\u012b encoding d\\u0131 th\\u012b\\u0300 phe\\u012byng phx s\\u0304\\u1ea3h\\u0304r\\u1ea1"
4772         "b thuk t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r, kher\\u1ee5\\u0304\\u0300xngh\\u0304m\\u0101y wrrkh txn l\\u00e6"
4773         "a s\\u0304\\u1ea1\\u1ef5l\\u1ea1ks\\u0304\\u02b9\\u1e47\\u0312 th\\u0101ng thekhnikh th\\u012b\\u0300 ch\\u0131"
4774         "\\u0302 k\\u1ea1n xy\\u016b\\u0300 th\\u1ea1\\u0300wp\\u1ecb.";
4775 
4776 
4777     UnicodeString  xlitText(thaiText);
4778     xlitText = xlitText.unescape();
4779     tr->transliterate(xlitText);
4780 
4781     UnicodeString expectedText(latinText);
4782     expectedText = expectedText.unescape();
4783     expect(*tr, xlitText, expectedText);
4784 
4785     delete tr;
4786 #endif
4787 }
4788 
4789 
4790 //======================================================================
4791 // Support methods
4792 //======================================================================
expectT(const UnicodeString & id,const UnicodeString & source,const UnicodeString & expectedResult)4793 void TransliteratorTest::expectT(const UnicodeString& id,
4794                                  const UnicodeString& source,
4795                                  const UnicodeString& expectedResult) {
4796     UErrorCode ec = U_ZERO_ERROR;
4797     UParseError pe;
4798     Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
4799     if (U_FAILURE(ec)) {
4800         errln((UnicodeString)"FAIL: Could not create " + id + " -  " + u_errorName(ec));
4801         delete t;
4802         return;
4803     }
4804     expect(*t, source, expectedResult);
4805     delete t;
4806 }
4807 
reportParseError(const UnicodeString & message,const UParseError & parseError,const UErrorCode & status)4808 void TransliteratorTest::reportParseError(const UnicodeString& message,
4809                                           const UParseError& parseError,
4810                                           const UErrorCode& status) {
4811     dataerrln(message +
4812           /*", parse error " + parseError.code +*/
4813           ", line " + parseError.line +
4814           ", offset " + parseError.offset +
4815           ", pre-context " + prettify(parseError.preContext, true) +
4816           ", post-context " + prettify(parseError.postContext,true) +
4817           ", Error: " + u_errorName(status));
4818 }
4819 
expect(const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4820 void TransliteratorTest::expect(const UnicodeString& rules,
4821                                 const UnicodeString& source,
4822                                 const UnicodeString& expectedResult,
4823                                 UTransPosition *pos) {
4824     expect("<ID>", rules, source, expectedResult, pos);
4825 }
4826 
expect(const UnicodeString & id,const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4827 void TransliteratorTest::expect(const UnicodeString& id,
4828                                 const UnicodeString& rules,
4829                                 const UnicodeString& source,
4830                                 const UnicodeString& expectedResult,
4831                                 UTransPosition *pos) {
4832     UErrorCode status = U_ZERO_ERROR;
4833     UParseError parseError;
4834     Transliterator* t = Transliterator::createFromRules(id, rules, UTRANS_FORWARD, parseError, status);
4835     if (U_FAILURE(status)) {
4836         reportParseError(UnicodeString("Couldn't create transliterator from ") + rules, parseError, status);
4837     } else {
4838         expect(*t, source, expectedResult, pos);
4839     }
4840     delete t;
4841 }
4842 
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,const Transliterator & reverseTransliterator)4843 void TransliteratorTest::expect(const Transliterator& t,
4844                                 const UnicodeString& source,
4845                                 const UnicodeString& expectedResult,
4846                                 const Transliterator& reverseTransliterator) {
4847     expect(t, source, expectedResult);
4848     expect(reverseTransliterator, expectedResult, source);
4849 }
4850 
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4851 void TransliteratorTest::expect(const Transliterator& t,
4852                                 const UnicodeString& source,
4853                                 const UnicodeString& expectedResult,
4854                                 UTransPosition *pos) {
4855     if (pos == 0) {
4856         UnicodeString result(source);
4857         t.transliterate(result);
4858         expectAux(t.getID() + ":String", source, result, expectedResult);
4859     }
4860     UTransPosition index={0, 0, 0, 0};
4861     if (pos != 0) {
4862         index = *pos;
4863     }
4864 
4865     UnicodeString rsource(source);
4866     if (pos == 0) {
4867         t.transliterate(rsource);
4868     } else {
4869         // Do it all at once -- below we do it incrementally
4870         t.finishTransliteration(rsource, *pos);
4871     }
4872     expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
4873 
4874     // Test keyboard (incremental) transliteration -- this result
4875     // must be the same after we finalize (see below).
4876     UnicodeString log;
4877     rsource.remove();
4878     if (pos != 0) {
4879         rsource = source;
4880         formatInput(log, rsource, index);
4881         log.append(" -> ");
4882         UErrorCode status = U_ZERO_ERROR;
4883         t.transliterate(rsource, index, status);
4884         formatInput(log, rsource, index);
4885     } else {
4886         for (int32_t i=0; i<source.length(); ++i) {
4887             if (i != 0) {
4888                 log.append(" + ");
4889             }
4890             log.append(source.charAt(i)).append(" -> ");
4891             UErrorCode status = U_ZERO_ERROR;
4892             t.transliterate(rsource, index, source.charAt(i), status);
4893             formatInput(log, rsource, index);
4894         }
4895     }
4896 
4897     // As a final step in keyboard transliteration, we must call
4898     // transliterate to finish off any pending partial matches that
4899     // were waiting for more input.
4900     t.finishTransliteration(rsource, index);
4901     log.append(" => ").append(rsource);
4902 
4903     expectAux(t.getID() + ":Keyboard", log,
4904               rsource == expectedResult,
4905               expectedResult);
4906 }
4907 
4908 
4909 /**
4910  * @param appendTo result is appended to this param.
4911  * @param input the string being transliterated
4912  * @param pos the index struct
4913  */
formatInput(UnicodeString & appendTo,const UnicodeString & input,const UTransPosition & pos)4914 UnicodeString& TransliteratorTest::formatInput(UnicodeString &appendTo,
4915                                                const UnicodeString& input,
4916                                                const UTransPosition& pos) {
4917     // Output a string of the form aaa{bbb|ccc|ddd}eee, where
4918     // the {} indicate the context start and limit, and the ||
4919     // indicate the start and limit.
4920     if (0 <= pos.contextStart &&
4921         pos.contextStart <= pos.start &&
4922         pos.start <= pos.limit &&
4923         pos.limit <= pos.contextLimit &&
4924         pos.contextLimit <= input.length()) {
4925 
4926         UnicodeString a, b, c, d, e;
4927         input.extractBetween(0, pos.contextStart, a);
4928         input.extractBetween(pos.contextStart, pos.start, b);
4929         input.extractBetween(pos.start, pos.limit, c);
4930         input.extractBetween(pos.limit, pos.contextLimit, d);
4931         input.extractBetween(pos.contextLimit, input.length(), e);
4932         appendTo.append(a).append((char16_t)123/*{*/).append(b).
4933             append((char16_t)PIPE).append(c).append((char16_t)PIPE).append(d).
4934             append((char16_t)125/*}*/).append(e);
4935     } else {
4936         appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
4937                         pos.contextStart + ", s=" + pos.start + ", l=" +
4938                         pos.limit + ", cl=" + pos.contextLimit + "} on " +
4939                         input);
4940     }
4941     return appendTo;
4942 }
4943 
expectAux(const UnicodeString & tag,const UnicodeString & source,const UnicodeString & result,const UnicodeString & expectedResult)4944 void TransliteratorTest::expectAux(const UnicodeString& tag,
4945                                    const UnicodeString& source,
4946                                    const UnicodeString& result,
4947                                    const UnicodeString& expectedResult) {
4948     expectAux(tag, source + " -> " + result,
4949               result == expectedResult,
4950               expectedResult);
4951 }
4952 
expectAux(const UnicodeString & tag,const UnicodeString & summary,UBool pass,const UnicodeString & expectedResult)4953 void TransliteratorTest::expectAux(const UnicodeString& tag,
4954                                    const UnicodeString& summary, UBool pass,
4955                                    const UnicodeString& expectedResult) {
4956     if (pass) {
4957         logln(UnicodeString("(")+tag+") " + prettify(summary));
4958     } else {
4959         dataerrln(UnicodeString("FAIL: (")+tag+") "
4960               + prettify(summary)
4961               + ", expected " + prettify(expectedResult));
4962     }
4963 }
4964 
4965 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
4966