1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1999-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 11/10/99 aliu Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "transtst.h"
18 #include "unicode/locid.h"
19 #include "unicode/dtfmtsym.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/translit.h"
22 #include "unicode/uchar.h"
23 #include "unicode/unifilt.h"
24 #include "unicode/uniset.h"
25 #include "unicode/ustring.h"
26 #include "unicode/usetiter.h"
27 #include "unicode/uscript.h"
28 #include "unicode/utf16.h"
29 #include "cpdtrans.h"
30 #include "nultrans.h"
31 #include "rbt.h"
32 #include "rbt_pars.h"
33 #include "anytrans.h"
34 #include "esctrn.h"
35 #include "name2uni.h"
36 #include "nortrans.h"
37 #include "remtrans.h"
38 #include "titletrn.h"
39 #include "tolowtrn.h"
40 #include "toupptrn.h"
41 #include "unesctrn.h"
42 #include "uni2name.h"
43 #include "cstring.h"
44 #include "cmemory.h"
45 #include <stdio.h>
46
47 /***********************************************************************
48
49 HOW TO USE THIS TEST FILE
50 -or-
51 How I developed on two platforms
52 without losing (too much of) my mind
53
54
55 1. Add new tests by copying/pasting/changing existing tests. On Java,
56 any public void method named Test...() taking no parameters becomes
57 a test. On C++, you need to modify the header and add a line to
58 the runIndexedTest() dispatch method.
59
60 2. Make liberal use of the expect() method; it is your friend.
61
62 3. The tests in this file exactly match those in a sister file on the
63 other side. The two files are:
64
65 icu4j: src/com/ibm/test/translit/TransliteratorTest.java
66 icu4c: source/test/intltest/transtst.cpp
67
68 ==> THIS IS THE IMPORTANT PART <==
69
70 When you add a test in this file, add it in TransliteratorTest.java
71 too. Give it the same name and put it in the same relative place.
72 This makes maintenance a lot simpler for any poor soul who ends up
73 trying to synchronize the tests between icu4j and icu4c.
74
75 4. If you MUST enter a test that is NOT paralleled in the sister file,
76 then add it in the special non-mirrored section. These are
77 labeled
78
79 "icu4j ONLY"
80
81 or
82
83 "icu4c ONLY"
84
85 Make sure you document the reason the test is here and not there.
86
87
88 Thank you.
89 The Management
90 ***********************************************************************/
91
// Character constants are given as explicit code point values (rather than
// character literals) to be EBCDIC-friendly.
enum {
    LEFT_BRACE = static_cast<char16_t>(0x007B), // '{'
    PIPE       = static_cast<char16_t>(0x007C), // '|'
    ZERO       = static_cast<char16_t>(0x0030), // '0'
    UPPER_A    = static_cast<char16_t>(0x0041)  // 'A'
};
99
TransliteratorTest()100 TransliteratorTest::TransliteratorTest()
101 : DESERET_DEE((UChar32)0x10414),
102 DESERET_dee((UChar32)0x1043C)
103 {
104 }
105
~TransliteratorTest()106 TransliteratorTest::~TransliteratorTest() {}
107
108 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)109 TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
110 const char* &name, char* /*par*/) {
111 switch (index) {
112 TESTCASE(0,TestInstantiation);
113 TESTCASE(1,TestSimpleRules);
114 TESTCASE(2,TestRuleBasedInverse);
115 TESTCASE(3,TestKeyboard);
116 TESTCASE(4,TestKeyboard2);
117 TESTCASE(5,TestKeyboard3);
118 TESTCASE(6,TestArabic);
119 TESTCASE(7,TestCompoundKana);
120 TESTCASE(8,TestCompoundHex);
121 TESTCASE(9,TestFiltering);
122 TESTCASE(10,TestInlineSet);
123 TESTCASE(11,TestPatternQuoting);
124 TESTCASE(12,TestJ277);
125 TESTCASE(13,TestJ243);
126 TESTCASE(14,TestJ329);
127 TESTCASE(15,TestSegments);
128 TESTCASE(16,TestCursorOffset);
129 TESTCASE(17,TestArbitraryVariableValues);
130 TESTCASE(18,TestPositionHandling);
131 TESTCASE(19,TestHiraganaKatakana);
132 TESTCASE(20,TestCopyJ476);
133 TESTCASE(21,TestAnchors);
134 TESTCASE(22,TestInterIndic);
135 TESTCASE(23,TestFilterIDs);
136 TESTCASE(24,TestCaseMap);
137 TESTCASE(25,TestNameMap);
138 TESTCASE(26,TestLiberalizedID);
139 TESTCASE(27,TestCreateInstance);
140 TESTCASE(28,TestNormalizationTransliterator);
141 TESTCASE(29,TestCompoundRBT);
142 TESTCASE(30,TestCompoundFilter);
143 TESTCASE(31,TestRemove);
144 TESTCASE(32,TestToRules);
145 TESTCASE(33,TestContext);
146 TESTCASE(34,TestSupplemental);
147 TESTCASE(35,TestQuantifier);
148 TESTCASE(36,TestSTV);
149 TESTCASE(37,TestCompoundInverse);
150 TESTCASE(38,TestNFDChainRBT);
151 TESTCASE(39,TestNullInverse);
152 TESTCASE(40,TestAliasInverseID);
153 TESTCASE(41,TestCompoundInverseID);
154 TESTCASE(42,TestUndefinedVariable);
155 TESTCASE(43,TestEmptyContext);
156 TESTCASE(44,TestCompoundFilterID);
157 TESTCASE(45,TestPropertySet);
158 TESTCASE(46,TestNewEngine);
159 TESTCASE(47,TestQuantifiedSegment);
160 TESTCASE(48,TestDevanagariLatinRT);
161 TESTCASE(49,TestTeluguLatinRT);
162 TESTCASE(50,TestCompoundLatinRT);
163 TESTCASE(51,TestSanskritLatinRT);
164 TESTCASE(52,TestLocaleInstantiation);
165 TESTCASE(53,TestTitleAccents);
166 TESTCASE(54,TestLocaleResource);
167 TESTCASE(55,TestParseError);
168 TESTCASE(56,TestOutputSet);
169 TESTCASE(57,TestVariableRange);
170 TESTCASE(58,TestInvalidPostContext);
171 TESTCASE(59,TestIDForms);
172 TESTCASE(60,TestToRulesMark);
173 TESTCASE(61,TestEscape);
174 TESTCASE(62,TestAnchorMasking);
175 TESTCASE(63,TestDisplayName);
176 TESTCASE(64,TestSpecialCases);
177 #if !UCONFIG_NO_FILE_IO
178 TESTCASE(65,TestIncrementalProgress);
179 #endif
180 TESTCASE(66,TestSurrogateCasing);
181 TESTCASE(67,TestFunction);
182 TESTCASE(68,TestInvalidBackRef);
183 TESTCASE(69,TestMulticharStringSet);
184 TESTCASE(70,TestUserFunction);
185 TESTCASE(71,TestAnyX);
186 TESTCASE(72,TestSourceTargetSet);
187 TESTCASE(73,TestGurmukhiDevanagari);
188 TESTCASE(74,TestPatternWhiteSpace);
189 TESTCASE(75,TestAllCodepoints);
190 TESTCASE(76,TestBoilerplate);
191 TESTCASE(77,TestAlternateSyntax);
192 TESTCASE(78,TestBeginEnd);
193 TESTCASE(79,TestBeginEndToRules);
194 TESTCASE(80,TestRegisterAlias);
195 TESTCASE(81,TestRuleStripping);
196 TESTCASE(82,TestHalfwidthFullwidth);
197 TESTCASE(83,TestThai);
198 TESTCASE(84,TestAny);
199 TESTCASE(85,TestBasicTransliteratorEvenWithoutData);
200 default: name = ""; break;
201 }
202 }
203
204 /**
205 * Make sure every system transliterator can be instantiated.
206 *
207 * ALSO test that the result of toRules() for each rule is a valid
208 * rule. Do this here so we don't have to have another test that
209 * instantiates everything as well.
210 */
TestInstantiation()211 void TransliteratorTest::TestInstantiation() {
212 UErrorCode ec = U_ZERO_ERROR;
213 StringEnumeration* avail = Transliterator::getAvailableIDs(ec);
214 assertSuccess("getAvailableIDs()", ec);
215 assertTrue("getAvailableIDs()!=nullptr", avail!=nullptr);
216 int32_t n = Transliterator::countAvailableIDs();
217 assertTrue("getAvailableIDs().count()==countAvailableIDs()",
218 avail->count(ec) == n);
219 assertSuccess("count()", ec);
220 UnicodeString name;
221 for (int32_t i=0; i<n; ++i) {
222 const UnicodeString& id = *avail->snext(ec);
223 if (!assertSuccess("snext()", ec) ||
224 !assertTrue("snext()!=nullptr", (&id)!=nullptr, true)) {
225 break;
226 }
227 UnicodeString id2 = Transliterator::getAvailableID(i);
228 if (id.length() < 1) {
229 errln(UnicodeString("FAIL: getAvailableID(") +
230 i + ") returned empty string");
231 continue;
232 }
233 if (id != id2) {
234 errln(UnicodeString("FAIL: getAvailableID(") +
235 i + ") != getAvailableIDs().snext()");
236 continue;
237 }
238 UParseError parseError;
239 UErrorCode status = U_ZERO_ERROR;
240 Transliterator* t = Transliterator::createInstance(id,
241 UTRANS_FORWARD, parseError,status);
242 name.truncate(0);
243 Transliterator::getDisplayName(id, name);
244 if (t == 0) {
245 #if UCONFIG_NO_BREAK_ITERATION
246 // If UCONFIG_NO_BREAK_ITERATION is on, then only Thai should fail.
247 if (id.compare((UnicodeString)"Thai-Latn") != 0 &&
248 id.compare((UnicodeString)"Thai-Latin") != 0)
249 #endif
250 dataerrln(UnicodeString("FAIL: Couldn't create ") + id +
251 /*", parse error " + parseError.code +*/
252 ", line " + parseError.line +
253 ", offset " + parseError.offset +
254 ", pre-context " + prettify(parseError.preContext, true) +
255 ", post-context " +prettify(parseError.postContext,true) +
256 ", Error: " + u_errorName(status));
257 // When createInstance fails, it deletes the failing
258 // entry from the available ID list. We detect this
259 // here by looking for a change in countAvailableIDs.
260 int32_t nn = Transliterator::countAvailableIDs();
261 if (nn == (n - 1)) {
262 n = nn;
263 --i; // Compensate for deleted entry
264 }
265 } else {
266 logln(UnicodeString("OK: ") + name + " (" + id + ")");
267
268 // Now test toRules
269 UnicodeString rules;
270 t->toRules(rules, true);
271 Transliterator *u = Transliterator::createFromRules("x",
272 rules, UTRANS_FORWARD, parseError,status);
273 if (u == 0) {
274 errln(UnicodeString("FAIL: ") + id +
275 ".createFromRules() => bad rules" +
276 /*", parse error " + parseError.code +*/
277 ", line " + parseError.line +
278 ", offset " + parseError.offset +
279 ", context " + prettify(parseError.preContext, true) +
280 ", rules: " + prettify(rules, true));
281 } else {
282 delete u;
283 }
284 delete t;
285 }
286 }
287 assertTrue("snext()==nullptr", avail->snext(ec)==nullptr);
288 assertSuccess("snext()", ec);
289 delete avail;
290
291 // Now test the failure path
292 UParseError parseError;
293 UErrorCode status = U_ZERO_ERROR;
294 UnicodeString id("<Not a valid Transliterator ID>");
295 Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
296 if (t != 0) {
297 errln("FAIL: " + id + " returned a transliterator");
298 delete t;
299 } else {
300 logln("OK: Bogus ID handled properly");
301 }
302 }
303
TestSimpleRules()304 void TransliteratorTest::TestSimpleRules() {
305 /* Example: rules 1. ab>x|y
306 * 2. yc>z
307 *
308 * []|eabcd start - no match, copy e to translated buffer
309 * [e]|abcd match rule 1 - copy output & adjust cursor
310 * [ex|y]cd match rule 2 - copy output & adjust cursor
311 * [exz]|d no match, copy d to transliterated buffer
312 * [exzd]| done
313 */
314 expect(UnicodeString("ab>x|y;", "") +
315 "yc>z",
316 "eabcd", "exzd");
317
318 /* Another set of rules:
319 * 1. ab>x|yzacw
320 * 2. za>q
321 * 3. qc>r
322 * 4. cw>n
323 *
324 * []|ab Rule 1
325 * [x|yzacw] No match
326 * [xy|zacw] Rule 2
327 * [xyq|cw] Rule 4
328 * [xyqn]| Done
329 */
330 expect(UnicodeString("ab>x|yzacw;") +
331 "za>q;" +
332 "qc>r;" +
333 "cw>n",
334 "ab", "xyqn");
335
336 /* Test categories
337 */
338 UErrorCode status = U_ZERO_ERROR;
339 UParseError parseError;
340 Transliterator *t = Transliterator::createFromRules(
341 "<ID>",
342 UnicodeString("$dummy=").append((char16_t)0xE100) +
343 UnicodeString(";"
344 "$vowel=[aeiouAEIOU];"
345 "$lu=[:Lu:];"
346 "$vowel } $lu > '!';"
347 "$vowel > '&';"
348 "'!' { $lu > '^';"
349 "$lu > '*';"
350 "a > ERROR", ""),
351 UTRANS_FORWARD, parseError,
352 status);
353 if (U_FAILURE(status)) {
354 dataerrln("FAIL: RBT constructor failed - %s", u_errorName(status));
355 return;
356 }
357 expect(*t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
358 delete t;
359 }
360
361 /**
362 * Test inline set syntax and set variable syntax.
363 */
TestInlineSet()364 void TransliteratorTest::TestInlineSet() {
365 expect("{ [:Ll:] } x > y; [:Ll:] > z;", "aAbxq", "zAyzz");
366 expect("a[0-9]b > qrs", "1a7b9", "1qrs9");
367
368 expect(UnicodeString(
369 "$digit = [0-9];"
370 "$alpha = [a-zA-Z];"
371 "$alphanumeric = [$digit $alpha];" // ***
372 "$special = [^$alphanumeric];" // ***
373 "$alphanumeric > '-';"
374 "$special > '*';", ""),
375
376 "thx-1138", "---*----");
377 }
378
379 /**
380 * Create some inverses and confirm that they work. We have to be
381 * careful how we do this, since the inverses will not be true
382 * inverses -- we can't throw any random string at the composition
383 * of the transliterators and expect the identity function. F x
384 * F' != I. However, if we are careful about the input, we will
385 * get the expected results.
386 */
TestRuleBasedInverse()387 void TransliteratorTest::TestRuleBasedInverse() {
388 UnicodeString RULES =
389 UnicodeString("abc>zyx;") +
390 "ab>yz;" +
391 "bc>zx;" +
392 "ca>xy;" +
393 "a>x;" +
394 "b>y;" +
395 "c>z;" +
396
397 "abc<zyx;" +
398 "ab<yz;" +
399 "bc<zx;" +
400 "ca<xy;" +
401 "a<x;" +
402 "b<y;" +
403 "c<z;" +
404
405 "";
406
407 const char* DATA[] = {
408 // Careful here -- random strings will not work. If we keep
409 // the left side to the domain and the right side to the range
410 // we will be okay though (left, abc; right xyz).
411 "a", "x",
412 "abcacab", "zyxxxyy",
413 "caccb", "xyzzy",
414 };
415
416 int32_t DATA_length = UPRV_LENGTHOF(DATA);
417
418 UErrorCode status = U_ZERO_ERROR;
419 UParseError parseError;
420 Transliterator *fwd = Transliterator::createFromRules("<ID>", RULES,
421 UTRANS_FORWARD, parseError, status);
422 Transliterator *rev = Transliterator::createFromRules("<ID>", RULES,
423 UTRANS_REVERSE, parseError, status);
424 if (U_FAILURE(status)) {
425 errln("FAIL: RBT constructor failed");
426 return;
427 }
428 for (int32_t i=0; i<DATA_length; i+=2) {
429 expect(*fwd, DATA[i], DATA[i+1]);
430 expect(*rev, DATA[i+1], DATA[i]);
431 }
432 delete fwd;
433 delete rev;
434 }
435
436 /**
437 * Basic test of keyboard.
438 */
TestKeyboard()439 void TransliteratorTest::TestKeyboard() {
440 UParseError parseError;
441 UErrorCode status = U_ZERO_ERROR;
442 Transliterator *t = Transliterator::createFromRules("<ID>",
443 UnicodeString("psch>Y;")
444 +"ps>y;"
445 +"ch>x;"
446 +"a>A;",
447 UTRANS_FORWARD, parseError,
448 status);
449 if (U_FAILURE(status)) {
450 errln("FAIL: RBT constructor failed");
451 return;
452 }
453 const char* DATA[] = {
454 // insertion, buffer
455 "a", "A",
456 "p", "Ap",
457 "s", "Aps",
458 "c", "Apsc",
459 "a", "AycA",
460 "psch", "AycAY",
461 0, "AycAY", // null means finishKeyboardTransliteration
462 };
463
464 keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
465 delete t;
466 }
467
468 /**
469 * Basic test of keyboard with cursor.
470 */
TestKeyboard2()471 void TransliteratorTest::TestKeyboard2() {
472 UParseError parseError;
473 UErrorCode status = U_ZERO_ERROR;
474 Transliterator *t = Transliterator::createFromRules("<ID>",
475 UnicodeString("ych>Y;")
476 +"ps>|y;"
477 +"ch>x;"
478 +"a>A;",
479 UTRANS_FORWARD, parseError,
480 status);
481 if (U_FAILURE(status)) {
482 errln("FAIL: RBT constructor failed");
483 return;
484 }
485 const char* DATA[] = {
486 // insertion, buffer
487 "a", "A",
488 "p", "Ap",
489 "s", "Aps", // modified for rollback - "Ay",
490 "c", "Apsc", // modified for rollback - "Ayc",
491 "a", "AycA",
492 "p", "AycAp",
493 "s", "AycAps", // modified for rollback - "AycAy",
494 "c", "AycApsc", // modified for rollback - "AycAyc",
495 "h", "AycAY",
496 0, "AycAY", // null means finishKeyboardTransliteration
497 };
498
499 keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
500 delete t;
501 }
502
503 /**
504 * Test keyboard transliteration with back-replacement.
505 */
TestKeyboard3()506 void TransliteratorTest::TestKeyboard3() {
507 // We want th>z but t>y. Furthermore, during keyboard
508 // transliteration we want t>y then yh>z if t, then h are
509 // typed.
510 UnicodeString RULES("t>|y;"
511 "yh>z;");
512
513 const char* DATA[] = {
514 // Column 1: characters to add to buffer (as if typed)
515 // Column 2: expected appearance of buffer after
516 // keyboard xliteration.
517 "a", "a",
518 "b", "ab",
519 "t", "abt", // modified for rollback - "aby",
520 "c", "abyc",
521 "t", "abyct", // modified for rollback - "abycy",
522 "h", "abycz",
523 0, "abycz", // null means finishKeyboardTransliteration
524 };
525
526 UParseError parseError;
527 UErrorCode status = U_ZERO_ERROR;
528 Transliterator *t = Transliterator::createFromRules("<ID>", RULES, UTRANS_FORWARD, parseError, status);
529 if (U_FAILURE(status)) {
530 errln("FAIL: RBT constructor failed");
531 return;
532 }
533 keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
534 delete t;
535 }
536
keyboardAux(const Transliterator & t,const char * DATA[],int32_t DATA_length)537 void TransliteratorTest::keyboardAux(const Transliterator& t,
538 const char* DATA[], int32_t DATA_length) {
539 UErrorCode status = U_ZERO_ERROR;
540 UTransPosition index={0, 0, 0, 0};
541 UnicodeString s;
542 for (int32_t i=0; i<DATA_length; i+=2) {
543 UnicodeString log;
544 if (DATA[i] != 0) {
545 log = s + " + "
546 + DATA[i]
547 + " -> ";
548 t.transliterate(s, index, DATA[i], status);
549 } else {
550 log = s + " => ";
551 t.finishTransliteration(s, index);
552 }
553 // Show the start index '{' and the cursor '|'
554 UnicodeString a, b, c;
555 s.extractBetween(0, index.contextStart, a);
556 s.extractBetween(index.contextStart, index.start, b);
557 s.extractBetween(index.start, s.length(), c);
558 log.append(a).
559 append((char16_t)LEFT_BRACE).
560 append(b).
561 append((char16_t)PIPE).
562 append(c);
563 if (s == DATA[i+1] && U_SUCCESS(status)) {
564 logln(log);
565 } else {
566 errln(UnicodeString("FAIL: ") + log + ", expected " + DATA[i+1]);
567 }
568 }
569 }
570
TestArabic()571 void TransliteratorTest::TestArabic() {
572 // Test disabled for 2.0 until new Arabic transliterator can be written.
573 // /*
574 // const char* DATA[] = {
575 // "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
576 // "\u0627\u0644\u0644\u063a\u0629\u0020"+
577 // "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
578 // "\u0628\u0628\u0646\u0638\u0645\u0020"+
579 // "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
580 // "\u062c\u0645\u064a\u0644\u0629",
581 // };
582 // */
583 //
584 // char16_t ar_raw[] = {
585 // 0x062a, 0x062a, 0x0645, 0x062a, 0x0639, 0x0020, 0x0627,
586 // 0x0644, 0x0644, 0x063a, 0x0629, 0x0020, 0x0627, 0x0644,
587 // 0x0639, 0x0631, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
588 // 0x0628, 0x0628, 0x0646, 0x0638, 0x0645, 0x0020, 0x0643,
589 // 0x062a, 0x0627, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
590 // 0x062c, 0x0645, 0x064a, 0x0644, 0x0629, 0
591 // };
592 // UnicodeString ar(ar_raw);
593 // UErrorCode status=U_ZERO_ERROR;
594 // UParseError parseError;
595 // Transliterator *t = Transliterator::createInstance("Latin-Arabic", UTRANS_FORWARD, parseError, status);
596 // if (t == 0) {
597 // errln("FAIL: createInstance failed");
598 // return;
599 // }
600 // expect(*t, "Arabic", ar);
601 // delete t;
602 }
603
604 /**
605 * Compose the Kana transliterator forward and reverse and try
606 * some strings that should come out unchanged.
607 */
TestCompoundKana()608 void TransliteratorTest::TestCompoundKana() {
609 UParseError parseError;
610 UErrorCode status = U_ZERO_ERROR;
611 Transliterator* t = Transliterator::createInstance("Latin-Hiragana;Hiragana-Latin", UTRANS_FORWARD, parseError, status);
612 if (t == 0) {
613 dataerrln("FAIL: construction of Latin-Hiragana;Hiragana-Latin failed - %s", u_errorName(status));
614 } else {
615 expect(*t, "aaaaa", "aaaaa");
616 delete t;
617 }
618 }
619
620 /**
621 * Compose the hex transliterators forward and reverse.
622 */
TestCompoundHex()623 void TransliteratorTest::TestCompoundHex() {
624 UParseError parseError;
625 UErrorCode status = U_ZERO_ERROR;
626 Transliterator* a = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
627 Transliterator* b = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, parseError, status);
628 Transliterator* transab[] = { a, b };
629 Transliterator* transba[] = { b, a };
630 if (a == 0 || b == 0) {
631 errln("FAIL: construction failed");
632 delete a;
633 delete b;
634 return;
635 }
636 // Do some basic tests of a
637 expect(*a, "01", UnicodeString("\\u0030\\u0031", ""));
638 // Do some basic tests of b
639 expect(*b, UnicodeString("\\u0030\\u0031", ""), "01");
640
641 Transliterator* ab = new CompoundTransliterator(transab, 2);
642 UnicodeString s("abcde", "");
643 expect(*ab, s, s);
644
645 UnicodeString str(s);
646 a->transliterate(str);
647 Transliterator* ba = new CompoundTransliterator(transba, 2);
648 expect(*ba, str, str);
649
650 delete ab;
651 delete ba;
652 delete a;
653 delete b;
654 }
655
656 int gTestFilterClassID = 0;
657 /**
658 * Used by TestFiltering().
659 */
660 class TestFilter : public UnicodeFilter {
clone() const661 virtual TestFilter* clone() const override {
662 return new TestFilter(*this);
663 }
contains(UChar32 c) const664 virtual UBool contains(UChar32 c) const override {
665 return c != (char16_t)0x0063 /*c*/;
666 }
667 // Stubs
toPattern(UnicodeString & result,UBool) const668 virtual UnicodeString& toPattern(UnicodeString& result,
669 UBool /*escapeUnprintable*/) const override {
670 return result;
671 }
matchesIndexValue(uint8_t) const672 virtual UBool matchesIndexValue(uint8_t /*v*/) const override {
673 return false;
674 }
addMatchSetTo(UnicodeSet &) const675 virtual void addMatchSetTo(UnicodeSet& /*toUnionTo*/) const override {}
676 public:
getDynamicClassID() const677 UClassID getDynamicClassID() const override { return (UClassID)&gTestFilterClassID; }
678 };
679
680 /**
681 * Do some basic tests of filtering.
682 */
TestFiltering()683 void TransliteratorTest::TestFiltering() {
684 UParseError parseError;
685 UErrorCode status = U_ZERO_ERROR;
686 Transliterator* hex = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
687 if (hex == 0) {
688 errln("FAIL: createInstance(Any-Hex) failed");
689 return;
690 }
691 hex->adoptFilter(new TestFilter());
692 UnicodeString s("abcde");
693 hex->transliterate(s);
694 UnicodeString exp("\\u0061\\u0062c\\u0064\\u0065", "");
695 if (s == exp) {
696 logln(UnicodeString("Ok: \"") + exp + "\"");
697 } else {
698 logln(UnicodeString("FAIL: \"") + s + "\", wanted \"" + exp + "\"");
699 }
700
701 // ICU4C ONLY. Do not find Transliterator.orphanFilter() in ICU4J.
702 UnicodeFilter *f = hex->orphanFilter();
703 if (f == nullptr){
704 errln("FAIL: orphanFilter() should get a UnicodeFilter");
705 } else {
706 delete f;
707 }
708 delete hex;
709 }
710
711 /**
712 * Test anchors
713 */
TestAnchors()714 void TransliteratorTest::TestAnchors() {
715 expect(UnicodeString("^a > 0; a$ > 2 ; a > 1;", ""),
716 "aaa",
717 "012");
718 expect(UnicodeString("$s=[z$]; $s{a>0; a}$s>2; a>1;", ""),
719 "aaa",
720 "012");
721 expect(UnicodeString("^ab > 01 ;"
722 " ab > |8 ;"
723 " b > k ;"
724 " 8x$ > 45 ;"
725 " 8x > 77 ;", ""),
726
727 "ababbabxabx",
728 "018k7745");
729 expect(UnicodeString("$s = [z$] ;"
730 "$s{ab > 01 ;"
731 " ab > |8 ;"
732 " b > k ;"
733 " 8x}$s > 45 ;"
734 " 8x > 77 ;", ""),
735
736 "abzababbabxzabxabx",
737 "01z018k45z01x45");
738 }
739
740 /**
741 * Test pattern quoting and escape mechanisms.
742 */
TestPatternQuoting()743 void TransliteratorTest::TestPatternQuoting() {
744 // Array of 3n items
745 // Each item is <rules>, <input>, <expected output>
746 const UnicodeString DATA[] = {
747 UnicodeString(char16_t(0x4E01)) + ">'[male adult]'",
748 UnicodeString(char16_t(0x4E01)),
749 "[male adult]"
750 };
751
752 for (int32_t i=0; i<3; i+=3) {
753 logln(UnicodeString("Pattern: ") + prettify(DATA[i]));
754 UParseError parseError;
755 UErrorCode status = U_ZERO_ERROR;
756 Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
757 if (U_FAILURE(status)) {
758 errln("RBT constructor failed");
759 } else {
760 expect(*t, DATA[i+1], DATA[i+2]);
761 }
762 delete t;
763 }
764 }
765
766 /**
767 * Regression test for bugs found in Greek transliteration.
768 */
TestJ277()769 void TransliteratorTest::TestJ277() {
770 UErrorCode status = U_ZERO_ERROR;
771 UParseError parseError;
772 Transliterator *gl = Transliterator::createInstance("Greek-Latin; NFD; [:M:]Remove; NFC", UTRANS_FORWARD, parseError, status);
773 if (gl == nullptr) {
774 dataerrln("FAIL: createInstance(Greek-Latin) returned nullptr - %s", u_errorName(status));
775 return;
776 }
777
778 char16_t sigma = 0x3C3;
779 char16_t upsilon = 0x3C5;
780 char16_t nu = 0x3BD;
781 // char16_t PHI = 0x3A6;
782 char16_t alpha = 0x3B1;
783 // char16_t omega = 0x3C9;
784 // char16_t omicron = 0x3BF;
785 // char16_t epsilon = 0x3B5;
786
787 // sigma upsilon nu -> syn
788 UnicodeString syn;
789 syn.append(sigma).append(upsilon).append(nu);
790 expect(*gl, syn, "syn");
791
792 // sigma alpha upsilon nu -> saun
793 UnicodeString sayn;
794 sayn.append(sigma).append(alpha).append(upsilon).append(nu);
795 expect(*gl, sayn, "saun");
796
797 // Again, using a smaller rule set
798 UnicodeString rules(
799 "$alpha = \\u03B1;"
800 "$nu = \\u03BD;"
801 "$sigma = \\u03C3;"
802 "$ypsilon = \\u03C5;"
803 "$vowel = [aeiouAEIOU$alpha$ypsilon];"
804 "s <> $sigma;"
805 "a <> $alpha;"
806 "u <> $vowel { $ypsilon;"
807 "y <> $ypsilon;"
808 "n <> $nu;",
809 "");
810 Transliterator *mini = Transliterator::createFromRules("mini", rules, UTRANS_REVERSE, parseError, status);
811 if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
812 expect(*mini, syn, "syn");
813 expect(*mini, sayn, "saun");
814 delete mini;
815 mini = nullptr;
816
817 #if !UCONFIG_NO_FORMATTING
818 // Transliterate the Greek locale data
819 Locale el("el");
820 DateFormatSymbols syms(el, status);
821 if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
822 int32_t i, count;
823 const UnicodeString* data = syms.getMonths(count);
824 for (i=0; i<count; ++i) {
825 if (data[i].length() == 0) {
826 continue;
827 }
828 UnicodeString out(data[i]);
829 gl->transliterate(out);
830 UBool ok = true;
831 if (data[i].length() >= 2 && out.length() >= 2 &&
832 u_isupper(data[i].charAt(0)) && u_islower(data[i].charAt(1))) {
833 if (!(u_isupper(out.charAt(0)) && u_islower(out.charAt(1)))) {
834 ok = false;
835 }
836 }
837 if (ok) {
838 logln(prettify(data[i] + " -> " + out));
839 } else {
840 errln(UnicodeString("FAIL: ") + prettify(data[i] + " -> " + out));
841 }
842 }
843 #endif
844
845 delete gl;
846 }
847
848 /**
849 * Prefix, suffix support in hex transliterators
850 */
TestJ243()851 void TransliteratorTest::TestJ243() {
852 UErrorCode ec = U_ZERO_ERROR;
853
854 // Test default Hex-Any, which should handle
855 // \u, \U, u+, and U+
856 Transliterator *hex =
857 Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, ec);
858 if (assertSuccess("getInstance", ec)) {
859 expect(*hex, UnicodeString("\\u0041+\\U00000042,U+0043uU+0044z", ""), "A+B,CuDz");
860 }
861 delete hex;
862
863 // // Try a custom Hex-Unicode
864 // // \uXXXX and &#xXXXX;
865 // ec = U_ZERO_ERROR;
866 // HexToUnicodeTransliterator hex2(UnicodeString("\\\\u###0;&\\#x###0\\;", ""), ec);
867 // expect(hex2, UnicodeString("\\u61\\u062\\u0063\\u00645\\u66x0123", ""),
868 // "abcd5fx0123");
869 // // Try custom Any-Hex (default is tested elsewhere)
870 // ec = U_ZERO_ERROR;
871 // UnicodeToHexTransliterator hex3(UnicodeString("&\\#x###0;", ""), ec);
872 // expect(hex3, "012", "012");
873 }
874
875 /**
876 * Parsers need better syntax error messages.
877 */
TestJ329()878 void TransliteratorTest::TestJ329() {
879
880 struct { UBool containsErrors; const char* rule; } DATA[] = {
881 { false, "a > b; c > d" },
882 { true, "a > b; no operator; c > d" },
883 };
884 int32_t DATA_length = UPRV_LENGTHOF(DATA);
885
886 for (int32_t i=0; i<DATA_length; ++i) {
887 UErrorCode status = U_ZERO_ERROR;
888 UParseError parseError;
889 Transliterator *rbt = Transliterator::createFromRules("<ID>",
890 DATA[i].rule,
891 UTRANS_FORWARD,
892 parseError,
893 status);
894 UBool gotError = U_FAILURE(status);
895 UnicodeString desc(DATA[i].rule);
896 desc.append(gotError ? " -> error" : " -> no error");
897 if (gotError) {
898 desc = desc + ", ParseError code=" + u_errorName(status) +
899 " line=" + parseError.line +
900 " offset=" + parseError.offset +
901 " context=" + parseError.preContext;
902 }
903 if (gotError == DATA[i].containsErrors) {
904 logln(UnicodeString("Ok: ") + desc);
905 } else {
906 errln(UnicodeString("FAIL: ") + desc);
907 }
908 delete rbt;
909 }
910 }
911
912 /**
913 * Test segments and segment references.
914 */
TestSegments()915 void TransliteratorTest::TestSegments() {
916 // Array of 3n items
917 // Each item is <rules>, <input>, <expected output>
918 UnicodeString DATA[] = {
919 "([a-z]) '.' ([0-9]) > $2 '-' $1",
920 "abc.123.xyz.456",
921 "ab1-c23.xy4-z56",
922
923 // nested
924 "(([a-z])([0-9])) > $1 '.' $2 '.' $3;",
925 "a1 b2",
926 "a1.a.1 b2.b.2",
927 };
928 int32_t DATA_length = UPRV_LENGTHOF(DATA);
929
930 for (int32_t i=0; i<DATA_length; i+=3) {
931 logln("Pattern: " + prettify(DATA[i]));
932 UParseError parseError;
933 UErrorCode status = U_ZERO_ERROR;
934 Transliterator *t = Transliterator::createFromRules("ID", DATA[i], UTRANS_FORWARD, parseError, status);
935 if (U_FAILURE(status)) {
936 errln("FAIL: RBT constructor");
937 } else {
938 expect(*t, DATA[i+1], DATA[i+2]);
939 }
940 delete t;
941 }
942 }
943
944 /**
945 * Test cursor positioning outside of the key
946 */
TestCursorOffset()947 void TransliteratorTest::TestCursorOffset() {
948 // Array of 3n items
949 // Each item is <rules>, <input>, <expected output>
950 UnicodeString DATA[] = {
951 "pre {alpha} post > | @ ALPHA ;"
952 "eALPHA > beta ;"
953 "pre {beta} post > BETA @@ | ;"
954 "post > xyz",
955
956 "prealphapost prebetapost",
957
958 "prbetaxyz preBETApost",
959 };
960 int32_t DATA_length = UPRV_LENGTHOF(DATA);
961
962 for (int32_t i=0; i<DATA_length; i+=3) {
963 logln("Pattern: " + prettify(DATA[i]));
964 UParseError parseError;
965 UErrorCode status = U_ZERO_ERROR;
966 Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
967 if (U_FAILURE(status)) {
968 errln("FAIL: RBT constructor");
969 } else {
970 expect(*t, DATA[i+1], DATA[i+2]);
971 }
972 delete t;
973 }
974 }
975
976 /**
977 * Test zero length and > 1 char length variable values. Test
978 * use of variable refs in UnicodeSets.
979 */
TestArbitraryVariableValues()980 void TransliteratorTest::TestArbitraryVariableValues() {
981 // Array of 3n items
982 // Each item is <rules>, <input>, <expected output>
983 UnicodeString DATA[] = {
984 "$abe = ab;"
985 "$pat = x[yY]z;"
986 "$ll = 'a-z';"
987 "$llZ = [$ll];"
988 "$llY = [$ll$pat];"
989 "$emp = ;"
990
991 "$abe > ABE;"
992 "$pat > END;"
993 "$llZ > 1;"
994 "$llY > 2;"
995 "7$emp 8 > 9;"
996 "",
997
998 "ab xYzxyz stY78",
999 "ABE ENDEND 1129",
1000 };
1001 int32_t DATA_length = UPRV_LENGTHOF(DATA);
1002
1003 for (int32_t i=0; i<DATA_length; i+=3) {
1004 logln("Pattern: " + prettify(DATA[i]));
1005 UParseError parseError;
1006 UErrorCode status = U_ZERO_ERROR;
1007 Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
1008 if (U_FAILURE(status)) {
1009 errln("FAIL: RBT constructor");
1010 } else {
1011 expect(*t, DATA[i+1], DATA[i+2]);
1012 }
1013 delete t;
1014 }
1015 }
1016
1017 /**
1018 * Confirm that the contextStart, contextLimit, start, and limit
1019 * behave correctly. J474.
1020 */
TestPositionHandling()1021 void TransliteratorTest::TestPositionHandling() {
1022 // Array of 3n items
1023 // Each item is <rules>, <input>, <expected output>
1024 const char* DATA[] = {
1025 "a{t} > SS ; {t}b > UU ; {t} > TT ;",
1026 "xtat txtb", // pos 0,9,0,9
1027 "xTTaSS TTxUUb",
1028
1029 "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1030 "xtat txtb", // pos 2,9,3,8
1031 "xtaSS TTxUUb",
1032
1033 "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1034 "xtat txtb", // pos 3,8,3,8
1035 "xtaTT TTxTTb",
1036 };
1037
1038 // Array of 4n positions -- these go with the DATA array
1039 // They are: contextStart, contextLimit, start, limit
1040 int32_t POS[] = {
1041 0, 9, 0, 9,
1042 2, 9, 3, 8,
1043 3, 8, 3, 8,
1044 };
1045
1046 int32_t n = UPRV_LENGTHOF(DATA) / 3;
1047 for (int32_t i=0; i<n; i++) {
1048 UErrorCode status = U_ZERO_ERROR;
1049 UParseError parseError;
1050 Transliterator *t = Transliterator::createFromRules("<ID>",
1051 DATA[3*i], UTRANS_FORWARD, parseError, status);
1052 if (U_FAILURE(status)) {
1053 delete t;
1054 errln("FAIL: RBT constructor");
1055 return;
1056 }
1057 UTransPosition pos;
1058 pos.contextStart= POS[4*i];
1059 pos.contextLimit = POS[4*i+1];
1060 pos.start = POS[4*i+2];
1061 pos.limit = POS[4*i+3];
1062 UnicodeString rsource(DATA[3*i+1]);
1063 t->transliterate(rsource, pos, status);
1064 if (U_FAILURE(status)) {
1065 delete t;
1066 errln("FAIL: transliterate");
1067 return;
1068 }
1069 t->finishTransliteration(rsource, pos);
1070 expectAux(DATA[3*i],
1071 DATA[3*i+1],
1072 rsource,
1073 DATA[3*i+2]);
1074 delete t;
1075 }
1076 }
1077
1078 /**
1079 * Test the Hiragana-Katakana transliterator.
1080 */
TestHiraganaKatakana()1081 void TransliteratorTest::TestHiraganaKatakana() {
1082 UParseError parseError;
1083 UErrorCode status = U_ZERO_ERROR;
1084 Transliterator* hk = Transliterator::createInstance("Hiragana-Katakana", UTRANS_FORWARD, parseError, status);
1085 Transliterator* kh = Transliterator::createInstance("Katakana-Hiragana", UTRANS_FORWARD, parseError, status);
1086 if (hk == 0 || kh == 0) {
1087 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1088 delete hk;
1089 delete kh;
1090 return;
1091 }
1092
1093 // Array of 3n items
1094 // Each item is "hk"|"kh"|"both", <Hiragana>, <Katakana>
1095 const char* DATA[] = {
1096 "both",
1097 "\\u3042\\u3090\\u3099\\u3092\\u3050",
1098 "\\u30A2\\u30F8\\u30F2\\u30B0",
1099
1100 "kh",
1101 "\\u307C\\u3051\\u3060\\u3042\\u3093\\u30FC",
1102 "\\u30DC\\u30F6\\u30C0\\u30FC\\u30F3\\u30FC",
1103 };
1104 int32_t DATA_length = UPRV_LENGTHOF(DATA);
1105
1106 for (int32_t i=0; i<DATA_length; i+=3) {
1107 UnicodeString h = CharsToUnicodeString(DATA[i+1]);
1108 UnicodeString k = CharsToUnicodeString(DATA[i+2]);
1109 switch (*DATA[i]) {
1110 case 0x68: //'h': // Hiragana-Katakana
1111 expect(*hk, h, k);
1112 break;
1113 case 0x6B: //'k': // Katakana-Hiragana
1114 expect(*kh, k, h);
1115 break;
1116 case 0x62: //'b': // both
1117 expect(*hk, h, k);
1118 expect(*kh, k, h);
1119 break;
1120 }
1121 }
1122 delete hk;
1123 delete kh;
1124 }
1125
1126 /**
1127 * Test cloning / copy constructor of RBT.
1128 */
TestCopyJ476()1129 void TransliteratorTest::TestCopyJ476() {
1130 // The real test here is what happens when the destructors are
1131 // called. So we let one object get destructed, and check to
1132 // see that its copy still works.
1133 Transliterator *t2 = 0;
1134 {
1135 UParseError parseError;
1136 UErrorCode status = U_ZERO_ERROR;
1137 Transliterator *t1 = Transliterator::createFromRules("t1",
1138 "a>A;b>B;'foo'+>'bar'", UTRANS_FORWARD, parseError, status);
1139 if (U_FAILURE(status)) {
1140 errln("FAIL: RBT constructor");
1141 return;
1142 }
1143 t2 = t1->clone(); // Call copy constructor under the covers.
1144 expect(*t1, "abcfoofoo", "ABcbar");
1145 delete t1;
1146 }
1147 expect(*t2, "abcfoofoo", "ABcbar");
1148 delete t2;
1149 }
1150
1151 /**
1152 * Test inter-Indic transliterators. These are composed.
1153 * ICU4C Jitterbug 483.
1154 */
TestInterIndic()1155 void TransliteratorTest::TestInterIndic() {
1156 UnicodeString ID("Devanagari-Gujarati", "");
1157 UErrorCode status = U_ZERO_ERROR;
1158 UParseError parseError;
1159 Transliterator* dg = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1160 if (dg == 0) {
1161 dataerrln("FAIL: createInstance(" + ID + ") returned nullptr - " + u_errorName(status));
1162 return;
1163 }
1164 UnicodeString id = dg->getID();
1165 if (id != ID) {
1166 errln("FAIL: createInstance(" + ID + ")->getID() => " + id);
1167 }
1168 UnicodeString dev = CharsToUnicodeString("\\u0901\\u090B\\u0925");
1169 UnicodeString guj = CharsToUnicodeString("\\u0A81\\u0A8B\\u0AA5");
1170 expect(*dg, dev, guj);
1171 delete dg;
1172 }
1173
1174 /**
1175 * Test filter syntax in IDs. (J918)
1176 */
TestFilterIDs()1177 void TransliteratorTest::TestFilterIDs() {
1178 // Array of 3n strings:
1179 // <id>, <inverse id>, <input>, <expected output>
1180 const char* DATA[] = {
1181 "[aeiou]Any-Hex", // ID
1182 "[aeiou]Hex-Any", // expected inverse ID
1183 "quizzical", // src
1184 "q\\u0075\\u0069zz\\u0069c\\u0061l", // expected ID.translit(src)
1185
1186 "[aeiou]Any-Hex;[^5]Hex-Any",
1187 "[^5]Any-Hex;[aeiou]Hex-Any",
1188 "quizzical",
1189 "q\\u0075izzical",
1190
1191 "[abc]Null",
1192 "[abc]Null",
1193 "xyz",
1194 "xyz",
1195 };
1196 enum { DATA_length = UPRV_LENGTHOF(DATA) };
1197
1198 for (int i=0; i<DATA_length; i+=4) {
1199 UnicodeString ID(DATA[i], "");
1200 UnicodeString uID(DATA[i+1], "");
1201 UnicodeString data2(DATA[i+2], "");
1202 UnicodeString data3(DATA[i+3], "");
1203 UParseError parseError;
1204 UErrorCode status = U_ZERO_ERROR;
1205 Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1206 if (t == 0) {
1207 errln("FAIL: createInstance(" + ID + ") returned nullptr");
1208 return;
1209 }
1210 expect(*t, data2, data3);
1211
1212 // Check the ID
1213 if (ID != t->getID()) {
1214 errln("FAIL: createInstance(" + ID + ").getID() => " +
1215 t->getID());
1216 }
1217
1218 // Check the inverse
1219 Transliterator *u = t->createInverse(status);
1220 if (u == 0) {
1221 errln("FAIL: " + ID + ".createInverse() returned nullptr");
1222 } else if (u->getID() != uID) {
1223 errln("FAIL: " + ID + ".createInverse().getID() => " +
1224 u->getID() + ", expected " + uID);
1225 }
1226
1227 delete t;
1228 delete u;
1229 }
1230 }
1231
1232 /**
1233 * Test the case mapping transliterators.
1234 */
TestCaseMap()1235 void TransliteratorTest::TestCaseMap() {
1236 UParseError parseError;
1237 UErrorCode status = U_ZERO_ERROR;
1238 Transliterator* toUpper =
1239 Transliterator::createInstance("Any-Upper[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1240 Transliterator* toLower =
1241 Transliterator::createInstance("Any-Lower[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1242 Transliterator* toTitle =
1243 Transliterator::createInstance("Any-Title[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1244 if (toUpper==0 || toLower==0 || toTitle==0) {
1245 errln("FAIL: createInstance returned nullptr");
1246 delete toUpper;
1247 delete toLower;
1248 delete toTitle;
1249 return;
1250 }
1251
1252 expect(*toUpper, "The quick brown fox jumped over the lazy dogs.",
1253 "THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
1254 expect(*toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
1255 "the quick brown foX jumped over the lazY dogs.");
1256 expect(*toTitle, "the quick brown foX can't jump over the laZy dogs.",
1257 "The Quick Brown FoX Can't Jump Over The LaZy Dogs.");
1258
1259 delete toUpper;
1260 delete toLower;
1261 delete toTitle;
1262 }
1263
1264 /**
1265 * Test the name mapping transliterators.
1266 */
TestNameMap()1267 void TransliteratorTest::TestNameMap() {
1268 UParseError parseError;
1269 UErrorCode status = U_ZERO_ERROR;
1270 Transliterator* uni2name =
1271 Transliterator::createInstance("Any-Name[^abc]", UTRANS_FORWARD, parseError, status);
1272 Transliterator* name2uni =
1273 Transliterator::createInstance("Name-Any", UTRANS_FORWARD, parseError, status);
1274 if (uni2name==0 || name2uni==0) {
1275 errln("FAIL: createInstance returned nullptr");
1276 delete uni2name;
1277 delete name2uni;
1278 return;
1279 }
1280
1281 // Careful: CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1282 expect(*uni2name, CharsToUnicodeString("\\u00A0abc\\u4E01\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF"),
1283 CharsToUnicodeString("\\\\N{NO-BREAK SPACE}abc\\\\N{CJK UNIFIED IDEOGRAPH-4E01}\\\\N{MICRO SIGN}\\\\N{GUJARATI SIGN CANDRABINDU}\\\\N{REPLACEMENT CHARACTER}\\\\N{<control-0004>}\\\\N{<control-0009>}\\\\N{<control-0081>}\\\\N{<noncharacter-FFFF>}"));
1284 expect(*name2uni, UNICODE_STRING_SIMPLE("{\\N { NO-BREAK SPACE}abc\\N{ CJK UNIFIED IDEOGRAPH-4E01 }\\N{x\\N{MICRO SIGN}\\N{GUJARATI SIGN CANDRABINDU}\\N{REPLACEMENT CHARACTER}\\N{<control-0004>}\\N{<control-0009>}\\N{<control-0081>}\\N{<noncharacter-FFFF>}\\N{<control-0004>}\\N{"),
1285 CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{"));
1286
1287 delete uni2name;
1288 delete name2uni;
1289
1290 // round trip
1291 Transliterator* t =
1292 Transliterator::createInstance("Any-Name;Name-Any", UTRANS_FORWARD, parseError, status);
1293 if (t==0) {
1294 errln("FAIL: createInstance returned nullptr");
1295 delete t;
1296 return;
1297 }
1298
1299 // Careful: CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1300 UnicodeString s = CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{");
1301 expect(*t, s, s);
1302 delete t;
1303 }
1304
1305 /**
1306 * Test liberalized ID syntax. 1006c
1307 */
TestLiberalizedID()1308 void TransliteratorTest::TestLiberalizedID() {
1309 // Some test cases have an expected getID() value of nullptr. This
1310 // means I have disabled the test case for now. This stuff is
1311 // still under development, and I haven't decided whether to make
1312 // getID() return canonical case yet. It will all get rewritten
1313 // with the move to Source-Target/Variant IDs anyway. [aliu]
1314 const char* DATA[] = {
1315 "latin-greek", nullptr /*"Latin-Greek"*/, "case insensitivity",
1316 " Null ", "Null", "whitespace",
1317 " Latin[a-z]-Greek ", "[a-z]Latin-Greek", "inline filter",
1318 " null ; latin-greek ", nullptr /*"Null;Latin-Greek"*/, "compound whitespace",
1319 };
1320 const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1321 UParseError parseError;
1322 UErrorCode status= U_ZERO_ERROR;
1323 for (int32_t i=0; i<DATA_length; i+=3) {
1324 Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, parseError, status);
1325 if (t == 0) {
1326 dataerrln(UnicodeString("FAIL: ") + DATA[i+2] +
1327 " cannot create ID \"" + DATA[i] + "\" - " + u_errorName(status));
1328 } else {
1329 UnicodeString exp;
1330 if (DATA[i+1]) {
1331 exp = UnicodeString(DATA[i+1], "");
1332 }
1333 // Don't worry about getID() if the expected char*
1334 // is nullptr -- see above.
1335 if (exp.length() == 0 || exp == t->getID()) {
1336 logln(UnicodeString("Ok: ") + DATA[i+2] +
1337 " create ID \"" + DATA[i] + "\" => \"" +
1338 exp + "\"");
1339 } else {
1340 errln(UnicodeString("FAIL: ") + DATA[i+2] +
1341 " create ID \"" + DATA[i] + "\" => \"" +
1342 t->getID() + "\", exp \"" + exp + "\"");
1343 }
1344 delete t;
1345 }
1346 }
1347 }
1348
1349 /* test for Jitterbug 912 */
TestCreateInstance()1350 void TransliteratorTest::TestCreateInstance(){
1351 const char* FORWARD = "F";
1352 const char* REVERSE = "R";
1353 const char* DATA[] = {
1354 // Column 1: id
1355 // Column 2: direction
1356 // Column 3: expected ID, or "" if expect failure
1357 "Latin-Hangul", REVERSE, "Hangul-Latin", // JB#912
1358
1359 // JB#2689: bad compound causes crash
1360 "InvalidSource-InvalidTarget", FORWARD, "",
1361 "InvalidSource-InvalidTarget", REVERSE, "",
1362 "Hex-Any;InvalidSource-InvalidTarget", FORWARD, "",
1363 "Hex-Any;InvalidSource-InvalidTarget", REVERSE, "",
1364 "InvalidSource-InvalidTarget;Hex-Any", FORWARD, "",
1365 "InvalidSource-InvalidTarget;Hex-Any", REVERSE, "",
1366
1367 nullptr
1368 };
1369
1370 for (int32_t i=0; DATA[i]; i+=3) {
1371 UParseError err;
1372 UErrorCode ec = U_ZERO_ERROR;
1373 UnicodeString id(DATA[i]);
1374 UTransDirection dir = (DATA[i+1]==FORWARD)?
1375 UTRANS_FORWARD:UTRANS_REVERSE;
1376 UnicodeString expID(DATA[i+2]);
1377 Transliterator* t =
1378 Transliterator::createInstance(id,dir,err,ec);
1379 UnicodeString newID;
1380 if (t) {
1381 newID = t->getID();
1382 }
1383 UBool ok = (newID == expID);
1384 if (!t) {
1385 newID = u_errorName(ec);
1386 }
1387 if (ok) {
1388 logln((UnicodeString)"Ok: createInstance(" +
1389 id + "," + DATA[i+1] + ") => " + newID);
1390 } else {
1391 dataerrln((UnicodeString)"FAIL: createInstance(" +
1392 id + "," + DATA[i+1] + ") => " + newID +
1393 ", expected " + expID);
1394 }
1395 delete t;
1396 }
1397 }
1398
1399 /**
1400 * Test the normalization transliterator.
1401 */
TestNormalizationTransliterator()1402 void TransliteratorTest::TestNormalizationTransliterator() {
1403 // THE FOLLOWING TWO TABLES ARE COPIED FROM com.ibm.test.normalizer.BasicTest
1404 // PLEASE KEEP THEM IN SYNC WITH BasicTest.
1405 const char* CANON[] = {
1406 // Input Decomposed Composed
1407 "cat", "cat", "cat" ,
1408 "\\u00e0ardvark", "a\\u0300ardvark", "\\u00e0ardvark" ,
1409
1410 "\\u1e0a", "D\\u0307", "\\u1e0a" , // D-dot_above
1411 "D\\u0307", "D\\u0307", "\\u1e0a" , // D dot_above
1412
1413 "\\u1e0c\\u0307", "D\\u0323\\u0307", "\\u1e0c\\u0307" , // D-dot_below dot_above
1414 "\\u1e0a\\u0323", "D\\u0323\\u0307", "\\u1e0c\\u0307" , // D-dot_above dot_below
1415 "D\\u0307\\u0323", "D\\u0323\\u0307", "\\u1e0c\\u0307" , // D dot_below dot_above
1416
1417 "\\u1e10\\u0307\\u0323", "D\\u0327\\u0323\\u0307","\\u1e10\\u0323\\u0307", // D dot_below cedilla dot_above
1418 "D\\u0307\\u0328\\u0323","D\\u0328\\u0323\\u0307","\\u1e0c\\u0328\\u0307", // D dot_above ogonek dot_below
1419
1420 "\\u1E14", "E\\u0304\\u0300", "\\u1E14" , // E-macron-grave
1421 "\\u0112\\u0300", "E\\u0304\\u0300", "\\u1E14" , // E-macron + grave
1422 "\\u00c8\\u0304", "E\\u0300\\u0304", "\\u00c8\\u0304" , // E-grave + macron
1423
1424 "\\u212b", "A\\u030a", "\\u00c5" , // angstrom_sign
1425 "\\u00c5", "A\\u030a", "\\u00c5" , // A-ring
1426
1427 "\\u00fdffin", "y\\u0301ffin", "\\u00fdffin" , //updated with 3.0
1428 "\\u00fd\\uFB03n", "y\\u0301\\uFB03n", "\\u00fd\\uFB03n" , //updated with 3.0
1429
1430 "Henry IV", "Henry IV", "Henry IV" ,
1431 "Henry \\u2163", "Henry \\u2163", "Henry \\u2163" ,
1432
1433 "\\u30AC", "\\u30AB\\u3099", "\\u30AC" , // ga (Katakana)
1434 "\\u30AB\\u3099", "\\u30AB\\u3099", "\\u30AC" , // ka + ten
1435 "\\uFF76\\uFF9E", "\\uFF76\\uFF9E", "\\uFF76\\uFF9E" , // hw_ka + hw_ten
1436 "\\u30AB\\uFF9E", "\\u30AB\\uFF9E", "\\u30AB\\uFF9E" , // ka + hw_ten
1437 "\\uFF76\\u3099", "\\uFF76\\u3099", "\\uFF76\\u3099" , // hw_ka + ten
1438
1439 "A\\u0300\\u0316", "A\\u0316\\u0300", "\\u00C0\\u0316" ,
1440 0 // end
1441 };
1442
1443 const char* COMPAT[] = {
1444 // Input Decomposed Composed
1445 "\\uFB4f", "\\u05D0\\u05DC", "\\u05D0\\u05DC" , // Alef-Lamed vs. Alef, Lamed
1446
1447 "\\u00fdffin", "y\\u0301ffin", "\\u00fdffin" , //updated for 3.0
1448 "\\u00fd\\uFB03n", "y\\u0301ffin", "\\u00fdffin" , // ffi ligature -> f + f + i
1449
1450 "Henry IV", "Henry IV", "Henry IV" ,
1451 "Henry \\u2163", "Henry IV", "Henry IV" ,
1452
1453 "\\u30AC", "\\u30AB\\u3099", "\\u30AC" , // ga (Katakana)
1454 "\\u30AB\\u3099", "\\u30AB\\u3099", "\\u30AC" , // ka + ten
1455
1456 "\\uFF76\\u3099", "\\u30AB\\u3099", "\\u30AC" , // hw_ka + ten
1457 0 // end
1458 };
1459
1460 int32_t i;
1461 UParseError parseError;
1462 UErrorCode status = U_ZERO_ERROR;
1463 Transliterator* NFD = Transliterator::createInstance("NFD", UTRANS_FORWARD, parseError, status);
1464 Transliterator* NFC = Transliterator::createInstance("NFC", UTRANS_FORWARD, parseError, status);
1465 if (!NFD || !NFC) {
1466 dataerrln("FAIL: createInstance failed: %s", u_errorName(status));
1467 delete NFD;
1468 delete NFC;
1469 return;
1470 }
1471 for (i=0; CANON[i]; i+=3) {
1472 UnicodeString in = CharsToUnicodeString(CANON[i]);
1473 UnicodeString expd = CharsToUnicodeString(CANON[i+1]);
1474 UnicodeString expc = CharsToUnicodeString(CANON[i+2]);
1475 expect(*NFD, in, expd);
1476 expect(*NFC, in, expc);
1477 }
1478 delete NFD;
1479 delete NFC;
1480
1481 Transliterator* NFKD = Transliterator::createInstance("NFKD", UTRANS_FORWARD, parseError, status);
1482 Transliterator* NFKC = Transliterator::createInstance("NFKC", UTRANS_FORWARD, parseError, status);
1483 if (!NFKD || !NFKC) {
1484 dataerrln("FAIL: createInstance failed");
1485 delete NFKD;
1486 delete NFKC;
1487 return;
1488 }
1489 for (i=0; COMPAT[i]; i+=3) {
1490 UnicodeString in = CharsToUnicodeString(COMPAT[i]);
1491 UnicodeString expkd = CharsToUnicodeString(COMPAT[i+1]);
1492 UnicodeString expkc = CharsToUnicodeString(COMPAT[i+2]);
1493 expect(*NFKD, in, expkd);
1494 expect(*NFKC, in, expkc);
1495 }
1496 delete NFKD;
1497 delete NFKC;
1498
1499 UParseError pe;
1500 status = U_ZERO_ERROR;
1501 Transliterator *t = Transliterator::createInstance("NFD; [x]Remove",
1502 UTRANS_FORWARD,
1503 pe, status);
1504 if (t == 0) {
1505 errln("FAIL: createInstance failed");
1506 }
1507 expect(*t, CharsToUnicodeString("\\u010dx"),
1508 CharsToUnicodeString("c\\u030C"));
1509 delete t;
1510 }
1511
1512 /**
1513 * Test we can create basic transliterator even without data.
1514 */
TestBasicTransliteratorEvenWithoutData()1515 void TransliteratorTest::TestBasicTransliteratorEvenWithoutData() {
1516 const char16_t* TEST_DATA = u"\u0124e\u0301 \uFB01nd x";
1517 const char16_t* EXPECTED_RESULTS[] = {
1518 u"H\u0302e\u0301 \uFB01nd x", // NFD
1519 u"\u0124\u00E9 \uFB01nd x", // NFC
1520 u"H\u0302e\u0301 find x", // NFKD
1521 u"\u0124\u00E9 find x", // NFKC
1522 u"\u0124e\u0301 \uFB01nd x", // Hex-Any
1523 u"\u0125e\u0301 \uFB01nd x", // Lower
1524 u"\u0124e\uFB01ndx", // [:^L:]Remove
1525 u"H\u0302e\u0301 \uFB01nd ", // NFD; [x]Remove
1526 u"h\u0302e\u0301 find x", // Lower; NFKD;
1527 u"hefindx", // Lower; NFKD; [:^L:]Remove; NFC;
1528 u"\u0124e \uFB01nd x", // [:Nonspacing Mark:] Remove;
1529 u"He \uFB01nd x", // NFD; [:Nonspacing Mark:] Remove; NFC;
1530 // end
1531 0
1532 };
1533
1534 const char* BASIC_TRANSLITERATOR_ID[] = {
1535 "NFD",
1536 "NFC",
1537 "NFKD",
1538 "NFKC",
1539 "Hex-Any",
1540 "Lower",
1541 "[:^L:]Remove",
1542 "NFD; [x]Remove",
1543 "Lower; NFKD;",
1544 "Lower; NFKD; [:^L:]Remove; NFC;",
1545 "[:Nonspacing Mark:] Remove;",
1546 "NFD; [:Nonspacing Mark:] Remove; NFC;",
1547 // end
1548 0
1549 };
1550 const char* BASIC_TRANSLITERATOR_RULES[] = {
1551 "::Lower; ::NFKD;",
1552 "::Lower; ::NFKD; ::[:^L:]Remove; ::NFC;",
1553 "::[:Nonspacing Mark:] Remove;",
1554 "::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;",
1555 // end
1556 0
1557 };
1558 for (int32_t i=0; BASIC_TRANSLITERATOR_ID[i]; i++) {
1559 UErrorCode status = U_ZERO_ERROR;
1560 UParseError parseError;
1561 std::unique_ptr<Transliterator> translit(Transliterator::createInstance(
1562 BASIC_TRANSLITERATOR_ID[i], UTRANS_FORWARD, parseError, status));
1563 if (translit.get() == nullptr || !U_SUCCESS(status)) {
1564 dataerrln("FAIL: createInstance %s failed", BASIC_TRANSLITERATOR_ID[i]);
1565 continue;
1566 }
1567 UnicodeString data(TEST_DATA);
1568 UnicodeString expected(EXPECTED_RESULTS[i]);
1569 translit->transliterate(data);
1570 if (data != expected) {
1571 dataerrln(UnicodeString("FAIL: expected translit(") +
1572 BASIC_TRANSLITERATOR_ID[i] + ") = '" +
1573 EXPECTED_RESULTS[i] + "' but got '" + data);
1574 continue;
1575 }
1576 }
1577 for (int32_t i=0; BASIC_TRANSLITERATOR_RULES[i]; i++) {
1578 UErrorCode status = U_ZERO_ERROR;
1579 UParseError parseError;
1580 std::unique_ptr<Transliterator> translit(Transliterator::createFromRules(
1581 "Test",
1582 BASIC_TRANSLITERATOR_RULES[i], UTRANS_FORWARD, parseError, status));
1583 if (translit.get() == nullptr || !U_SUCCESS(status)) {
1584 dataerrln("FAIL: createFromRules %s failed", BASIC_TRANSLITERATOR_RULES[i]);
1585 continue;
1586 }
1587 }
1588 }
1589
1590 /**
1591 * Test compound RBT rules.
1592 */
TestCompoundRBT()1593 void TransliteratorTest::TestCompoundRBT() {
1594 // Careful with spacing and ';' here: Phrase this exactly
1595 // as toRules() is going to return it. If toRules() changes
1596 // with regard to spacing or ';', then adjust this string.
1597 UnicodeString rule("::Hex-Any;\n"
1598 "::Any-Lower;\n"
1599 "a > '.A.';\n"
1600 "b > '.B.';\n"
1601 "::[^t]Any-Upper;", "");
1602 UParseError parseError;
1603 UErrorCode status = U_ZERO_ERROR;
1604 Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, parseError, status);
1605 if (t == 0) {
1606 errln("FAIL: createFromRules failed");
1607 return;
1608 }
1609 expect(*t, UNICODE_STRING_SIMPLE("\\u0043at in the hat, bat on the mat"),
1610 "C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
1611 UnicodeString r;
1612 t->toRules(r, true);
1613 if (r == rule) {
1614 logln((UnicodeString)"OK: toRules() => " + r);
1615 } else {
1616 errln((UnicodeString)"FAIL: toRules() => " + r +
1617 ", expected " + rule);
1618 }
1619 delete t;
1620
1621 // Now test toRules
1622 t = Transliterator::createInstance("Greek-Latin; Latin-Cyrillic", UTRANS_FORWARD, parseError, status);
1623 if (t == 0) {
1624 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1625 return;
1626 }
1627 UnicodeString exp("::Greek-Latin;\n::Latin-Cyrillic;");
1628 t->toRules(r, true);
1629 if (r != exp) {
1630 errln((UnicodeString)"FAIL: toRules() => " + r +
1631 ", expected " + exp);
1632 } else {
1633 logln((UnicodeString)"OK: toRules() => " + r);
1634 }
1635 delete t;
1636
1637 // Round trip the result of toRules
1638 t = Transliterator::createFromRules("Test", r, UTRANS_FORWARD, parseError, status);
1639 if (t == 0) {
1640 errln("FAIL: createFromRules #2 failed");
1641 return;
1642 } else {
1643 logln((UnicodeString)"OK: createFromRules(" + r + ") succeeded");
1644 }
1645
1646 // Test toRules again
1647 t->toRules(r, true);
1648 if (r != exp) {
1649 errln((UnicodeString)"FAIL: toRules() => " + r +
1650 ", expected " + exp);
1651 } else {
1652 logln((UnicodeString)"OK: toRules() => " + r);
1653 }
1654
1655 delete t;
1656
1657 // Test Foo(Bar) IDs. Careful with spacing in id; make it conform
1658 // to what the regenerated ID will look like.
1659 UnicodeString id("Upper(Lower);(NFKC)", "");
1660 t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
1661 if (t == 0) {
1662 errln("FAIL: createInstance #2 failed");
1663 return;
1664 }
1665 if (t->getID() == id) {
1666 logln((UnicodeString)"OK: created " + id);
1667 } else {
1668 errln((UnicodeString)"FAIL: createInstance(" + id +
1669 ").getID() => " + t->getID());
1670 }
1671
1672 Transliterator *u = t->createInverse(status);
1673 if (u == 0) {
1674 errln("FAIL: createInverse failed");
1675 delete t;
1676 return;
1677 }
1678 exp = "NFKC();Lower(Upper)";
1679 if (u->getID() == exp) {
1680 logln((UnicodeString)"OK: createInverse(" + id + ") => " +
1681 u->getID());
1682 } else {
1683 errln((UnicodeString)"FAIL: createInverse(" + id + ") => " +
1684 u->getID());
1685 }
1686 delete t;
1687 delete u;
1688 }
1689
1690 /**
1691 * Compound filter semantics were originally not implemented
1692 * correctly. Originally, each component filter f(i) is replaced by
1693 * f'(i) = f(i) && g, where g is the filter for the compound
1694 * transliterator.
1695 *
1696 * From Mark:
1697 *
1698 * Suppose and I have a transliterator X. Internally X is
1699 * "Greek-Latin; Latin-Cyrillic; Any-Lower". I use a filter [^A].
1700 *
1701 * The compound should convert all greek characters (through latin) to
1702 * cyrillic, then lowercase the result. The filter should say "don't
1703 * touch 'A' in the original". But because an intermediate result
1704 * happens to go through "A", the Greek Alpha gets hung up.
1705 */
TestCompoundFilter()1706 void TransliteratorTest::TestCompoundFilter() {
1707 UParseError parseError;
1708 UErrorCode status = U_ZERO_ERROR;
1709 Transliterator *t = Transliterator::createInstance
1710 ("Greek-Latin; Latin-Greek; Lower", UTRANS_FORWARD, parseError, status);
1711 if (t == 0) {
1712 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1713 return;
1714 }
1715 t->adoptFilter(new UnicodeSet("[^A]", status));
1716 if (U_FAILURE(status)) {
1717 errln("FAIL: UnicodeSet ct failed");
1718 delete t;
1719 return;
1720 }
1721
1722 // Only the 'A' at index 1 should remain unchanged
1723 expect(*t,
1724 CharsToUnicodeString("BA\\u039A\\u0391"),
1725 CharsToUnicodeString("\\u03b2A\\u03ba\\u03b1"));
1726 delete t;
1727 }
1728
TestRemove()1729 void TransliteratorTest::TestRemove() {
1730 UParseError parseError;
1731 UErrorCode status = U_ZERO_ERROR;
1732 Transliterator *t = Transliterator::createInstance("Remove[abc]", UTRANS_FORWARD, parseError, status);
1733 if (t == 0) {
1734 errln("FAIL: createInstance failed");
1735 return;
1736 }
1737
1738 expect(*t, "Able bodied baker's cats", "Ale odied ker's ts");
1739
1740 // extra test for RemoveTransliterator::clone(), which at one point wasn't
1741 // duplicating the filter
1742 Transliterator* t2 = t->clone();
1743 expect(*t2, "Able bodied baker's cats", "Ale odied ker's ts");
1744
1745 delete t;
1746 delete t2;
1747 }
1748
TestToRules()1749 void TransliteratorTest::TestToRules() {
1750 const char* RBT = "rbt";
1751 const char* SET = "set";
1752 static const char* DATA[] = {
1753 RBT,
1754 "$a=\\u4E61; [$a] > A;",
1755 "[\\u4E61] > A;",
1756
1757 RBT,
1758 "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
1759 "[[:Zs:][:Zl:]]{a} > A;",
1760
1761 SET,
1762 "[[:Zs:][:Zl:]]",
1763 "[[:Zs:][:Zl:]]",
1764
1765 SET,
1766 "[:Ps:]",
1767 "[:Ps:]",
1768
1769 SET,
1770 "[:L:]",
1771 "[:L:]",
1772
1773 SET,
1774 "[[:L:]-[A]]",
1775 "[[:L:]-[A]]",
1776
1777 SET,
1778 "[~[:Lu:][:Ll:]]",
1779 "[~[:Lu:][:Ll:]]",
1780
1781 SET,
1782 "[~[a-z]]",
1783 "[~[a-z]]",
1784
1785 RBT,
1786 "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
1787 "[^[:Zs:]]{a} > A;",
1788
1789 RBT,
1790 "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
1791 "[[a-z]-[:Zs:]]{a} > A;",
1792
1793 RBT,
1794 "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
1795 "[[:Zs:]&[a-z]]{a} > A;",
1796
1797 RBT,
1798 "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
1799 "[x[:Zs:]]{a} > A;",
1800
1801 RBT,
1802 "$accentMinus = [ [\\u0300-\\u0345] & [:M:] - [\\u0338]] ;"
1803 "$macron = \\u0304 ;"
1804 "$evowel = [aeiouyAEIOUY] ;"
1805 "$iotasub = \\u0345 ;"
1806 "($evowel $macron $accentMinus *) i > | $1 $iotasub ;",
1807 "([AEIOUYaeiouy]\\u0304[[\\u0300-\\u0345]&[:M:]-[\\u0338]]*)i > | $1 \\u0345;",
1808
1809 RBT,
1810 "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1811 "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1812 };
1813 static const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1814
1815 for (int32_t d=0; d < DATA_length; d+=3) {
1816 if (DATA[d] == RBT) {
1817 // Transliterator test
1818 UParseError parseError;
1819 UErrorCode status = U_ZERO_ERROR;
1820 Transliterator *t = Transliterator::createFromRules("ID",
1821 UnicodeString(DATA[d+1], -1, US_INV), UTRANS_FORWARD, parseError, status);
1822 if (t == 0) {
1823 dataerrln("FAIL: createFromRules failed - %s", u_errorName(status));
1824 return;
1825 }
1826 UnicodeString rules, escapedRules;
1827 t->toRules(rules, false);
1828 t->toRules(escapedRules, true);
1829 UnicodeString expRules = CharsToUnicodeString(DATA[d+2]);
1830 UnicodeString expEscapedRules(DATA[d+2], -1, US_INV);
1831 if (rules == expRules) {
1832 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1833 " => " + rules);
1834 } else {
1835 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1836 " => " + rules + ", exp " + expRules);
1837 }
1838 if (escapedRules == expEscapedRules) {
1839 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1840 " => " + escapedRules);
1841 } else {
1842 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1843 " => " + escapedRules + ", exp " + expEscapedRules);
1844 }
1845 delete t;
1846
1847 } else {
1848 // UnicodeSet test
1849 UErrorCode status = U_ZERO_ERROR;
1850 UnicodeString pat(DATA[d+1], -1, US_INV);
1851 UnicodeString expToPat(DATA[d+2], -1, US_INV);
1852 UnicodeSet set(pat, status);
1853 if (U_FAILURE(status)) {
1854 errln("FAIL: UnicodeSet ct failed");
1855 return;
1856 }
1857 // Adjust spacing etc. as necessary.
1858 UnicodeString toPat;
1859 set.toPattern(toPat);
1860 if (expToPat == toPat) {
1861 logln((UnicodeString)"Ok: " + pat +
1862 " => " + toPat);
1863 } else {
1864 errln((UnicodeString)"FAIL: " + pat +
1865 " => " + prettify(toPat, true) +
1866 ", exp " + prettify(pat, true));
1867 }
1868 }
1869 }
1870 }
1871
TestContext()1872 void TransliteratorTest::TestContext() {
1873 UTransPosition pos = {0, 2, 0, 1}; // cs cl s l
1874 expect("de > x; {d}e > y;",
1875 "de",
1876 "ye",
1877 &pos);
1878
1879 expect("ab{c} > z;",
1880 "xadabdabcy",
1881 "xadabdabzy");
1882 }
1883
TestSupplemental()1884 void TransliteratorTest::TestSupplemental() {
1885
1886 expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];"
1887 "a > $a; $s > i;"),
1888 CharsToUnicodeString("ab\\U0001030Fx"),
1889 CharsToUnicodeString("\\U00010300bix"));
1890
1891 expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];"
1892 "$b=[A-Z\\U00010400-\\U0001044D];"
1893 "($a)($b) > $2 $1;"),
1894 CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
1895 CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
1896
1897 // k|ax\\U00010300xm
1898
1899 // k|a\\U00010400\\U00010300xm
1900 // ky|\\U00010400\\U00010300xm
1901 // ky\\U00010400|\\U00010300xm
1902
1903 // ky\\U00010400|\\U00010300\\U00010400m
1904 // ky\\U00010400y|\\U00010400m
1905 expect(CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];"
1906 "$a {x} > | @ \\U00010400;"
1907 "{$a} [^\\u0000-\\uFFFF] > y;"),
1908 CharsToUnicodeString("kax\\U00010300xm"),
1909 CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
1910
1911 expectT("Any-Name",
1912 CharsToUnicodeString("\\U00010330\\U000E0061\\u00A0"),
1913 UNICODE_STRING_SIMPLE("\\N{GOTHIC LETTER AHSA}\\N{TAG LATIN SMALL LETTER A}\\N{NO-BREAK SPACE}"));
1914
1915 expectT("Any-Hex/Unicode",
1916 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1917 UNICODE_STRING_SIMPLE("U+10330U+10FF00U+E0061U+00A0"));
1918
1919 expectT("Any-Hex/C",
1920 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1921 UNICODE_STRING_SIMPLE("\\U00010330\\U0010FF00\\U000E0061\\u00A0"));
1922
1923 expectT("Any-Hex/Perl",
1924 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1925 UNICODE_STRING_SIMPLE("\\x{10330}\\x{10FF00}\\x{E0061}\\x{A0}"));
1926
1927 expectT("Any-Hex/Java",
1928 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1929 UNICODE_STRING_SIMPLE("\\uD800\\uDF30\\uDBFF\\uDF00\\uDB40\\uDC61\\u00A0"));
1930
1931 expectT("Any-Hex/XML",
1932 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1933 "𐌰􏼀󠁡 ");
1934
1935 expectT("Any-Hex/XML10",
1936 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1937 "𐌰􏼀󠁡 ");
1938
1939 expectT(UNICODE_STRING_SIMPLE("[\\U000E0000-\\U000E0FFF] Remove"),
1940 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1941 CharsToUnicodeString("\\U00010330\\U0010FF00\\u00A0"));
1942 }
1943
TestQuantifier()1944 void TransliteratorTest::TestQuantifier() {
1945
1946 // Make sure @ in a quantified anteContext works
1947 expect("a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
1948 "AAAAAb",
1949 "aaa(aac)");
1950
1951 // Make sure @ in a quantified postContext works
1952 expect("{b} a+ > c @@ |; (a+) > '(' $1 ')';",
1953 "baaaaa",
1954 "caa(aaa)");
1955
1956 // Make sure @ in a quantified postContext with seg ref works
1957 expect("{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
1958 "baaaaa",
1959 "baa(aaa)");
1960
1961 // Make sure @ past ante context doesn't enter ante context
1962 UTransPosition pos = {0, 5, 3, 5};
1963 expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
1964 "xxxab",
1965 "xxx(ac)",
1966 &pos);
1967
1968 // Make sure @ past post context doesn't pass limit
1969 UTransPosition pos2 = {0, 4, 0, 2};
1970 expect("{b} a+ > c @@ |; x > y; a > A;",
1971 "baxx",
1972 "caxx",
1973 &pos2);
1974
1975 // Make sure @ past post context doesn't enter post context
1976 expect("{b} a+ > c @@ |; x > y; a > A;",
1977 "baxx",
1978 "cayy");
1979
1980 expect("(ab)? c > d;",
1981 "c abc ababc",
1982 "d d abd");
1983
1984 // NOTE: The (ab)+ when referenced just yields a single "ab",
1985 // not the full sequence of them. This accords with perl behavior.
1986 expect("(ab)+ {x} > '(' $1 ')';",
1987 "x abx ababxy",
1988 "x ab(ab) abab(ab)y");
1989
1990 expect("b+ > x;",
1991 "ac abc abbc abbbc",
1992 "ac axc axc axc");
1993
1994 expect("[abc]+ > x;",
1995 "qac abrc abbcs abtbbc",
1996 "qx xrx xs xtx");
1997
1998 expect("q{(ab)+} > x;",
1999 "qa qab qaba qababc qaba",
2000 "qa qx qxa qxc qxa");
2001
2002 expect("q(ab)* > x;",
2003 "qa qab qaba qababc",
2004 "xa x xa xc");
2005
2006 // NOTE: The (ab)+ when referenced just yields a single "ab",
2007 // not the full sequence of them. This accords with perl behavior.
2008 expect("q(ab)* > '(' $1 ')';",
2009 "qa qab qaba qababc",
2010 "()a (ab) (ab)a (ab)c");
2011
2012 // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
2013 // quoted string
2014 expect("'ab'+ > x;",
2015 "bb ab ababb",
2016 "bb x xb");
2017
2018 // $foo+ and $foo* -- the quantifier should apply to the entire
2019 // variable reference
2020 expect("$var = ab; $var+ > x;",
2021 "bb ab ababb",
2022 "bb x xb");
2023 }
2024
2025 class TestTrans : public Transliterator {
2026 public:
TestTrans(const UnicodeString & id)2027 TestTrans(const UnicodeString& id) : Transliterator(id, 0) {
2028 }
clone() const2029 virtual TestTrans* clone() const override {
2030 return new TestTrans(getID());
2031 }
handleTransliterate(Replaceable &,UTransPosition & offsets,UBool) const2032 virtual void handleTransliterate(Replaceable& /*text*/, UTransPosition& offsets,
2033 UBool /*isIncremental*/) const override
2034 {
2035 offsets.start = offsets.limit;
2036 }
2037 virtual UClassID getDynamicClassID() const override;
2038 static UClassID U_EXPORT2 getStaticClassID();
2039 };
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)2040 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)
2041
2042 /**
2043 * Test Source-Target/Variant.
2044 */
2045 void TransliteratorTest::TestSTV() {
2046 int32_t ns = Transliterator::countAvailableSources();
2047 logln((UnicodeString)"countAvailableSources at start: " + ns);
2048 if (ns < 0 || ns > 255) {
2049 errln((UnicodeString)"FAIL: Bad source count: " + ns);
2050 return;
2051 }
2052 int32_t i, j;
2053 for (i=0; i<ns; ++i) {
2054 UnicodeString source;
2055 Transliterator::getAvailableSource(i, source);
2056 logln((UnicodeString)"" + i + ": " + source);
2057 if (source.length() == 0) {
2058 errln("FAIL: empty source");
2059 continue;
2060 }
2061 int32_t nt = Transliterator::countAvailableTargets(source);
2062 if (nt < 0 || nt > 255) {
2063 errln((UnicodeString)"FAIL: Bad target count: " + nt);
2064 continue;
2065 }
2066 for (int32_t j=0; j<nt; ++j) {
2067 UnicodeString target;
2068 Transliterator::getAvailableTarget(j, source, target);
2069 logln((UnicodeString)" " + j + ": " + target);
2070 if (target.length() == 0) {
2071 errln("FAIL: empty target");
2072 continue;
2073 }
2074 int32_t nv = Transliterator::countAvailableVariants(source, target);
2075 if (nv < 0 || nv > 255) {
2076 errln((UnicodeString)"FAIL: Bad variant count: " + nv);
2077 continue;
2078 }
2079 for (int32_t k=0; k<nv; ++k) {
2080 UnicodeString variant;
2081 Transliterator::getAvailableVariant(k, source, target, variant);
2082 if (variant.length() == 0) {
2083 logln((UnicodeString)" " + k + ": <empty>");
2084 } else {
2085 logln((UnicodeString)" " + k + ": " + variant);
2086 }
2087 }
2088 }
2089 }
2090
2091 // Test registration
2092 const char* IDS[] = { "Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2093 const char* FULL_IDS[] = { "Any-Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2094 const char* SOURCES[] = { nullptr, "Seoridf", "Oewoir" };
2095 for (i=0; i<3; ++i) {
2096 Transliterator *t = new TestTrans(IDS[i]);
2097 if (t == 0) {
2098 errln("FAIL: out of memory");
2099 return;
2100 }
2101 if (t->getID() != IDS[i]) {
2102 errln((UnicodeString)"FAIL: ID mismatch for " + IDS[i]);
2103 delete t;
2104 return;
2105 }
2106 Transliterator::registerInstance(t);
2107 UErrorCode status = U_ZERO_ERROR;
2108 t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2109 if (t == nullptr) {
2110 errln((UnicodeString)"FAIL: Registration/creation failed for ID " +
2111 IDS[i]);
2112 } else {
2113 logln((UnicodeString)"Ok: Registration/creation succeeded for ID " +
2114 IDS[i]);
2115 delete t;
2116 }
2117 Transliterator::unregister(IDS[i]);
2118 t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2119 if (t != nullptr) {
2120 errln((UnicodeString)"FAIL: Unregistration failed for ID " +
2121 IDS[i]);
2122 delete t;
2123 }
2124 }
2125
2126 // Make sure getAvailable API reflects removal
2127 int32_t n = Transliterator::countAvailableIDs();
2128 logln((UnicodeString)"countAvailableIDs at end: " + n);
2129 for (i=0; i<n; ++i) {
2130 UnicodeString id = Transliterator::getAvailableID(i);
2131 for (j=0; j<3; ++j) {
2132 if (id.caseCompare(FULL_IDS[j],0)==0) {
2133 errln((UnicodeString)"FAIL: unregister(" + id + ") failed");
2134 }
2135 }
2136 }
2137 n = Transliterator::countAvailableTargets("Any");
2138 logln((UnicodeString)"countAvailableTargets(\"Any\") at end: " + n);
2139 for (i=0; i<n; ++i) {
2140 UnicodeString t;
2141 Transliterator::getAvailableTarget(i, "Any", t);
2142 if (t.caseCompare(IDS[0],0)==0) {
2143 errln((UnicodeString)"FAIL: unregister(Any-" + t + ") failed");
2144 }
2145 }
2146 n = Transliterator::countAvailableSources();
2147 logln((UnicodeString)"countAvailableSources at end: " + n);
2148 for (i=0; i<n; ++i) {
2149 UnicodeString s;
2150 Transliterator::getAvailableSource(i, s);
2151 for (j=0; j<3; ++j) {
2152 if (SOURCES[j] == nullptr) continue;
2153 if (s.caseCompare(SOURCES[j],0)==0) {
2154 if (j!=2 || !logKnownIssue("21911", "ICU4C cannot create inverse of (or unregister) Any-Xxxx/Variant transform created from both-direction transform")) {
2155 errln((UnicodeString)"FAIL: unregister(" + s + "-*) failed");
2156 }
2157 }
2158 }
2159 }
2160 }
2161
2162 /**
2163 * Test inverse of Greek-Latin; Title()
2164 */
TestCompoundInverse()2165 void TransliteratorTest::TestCompoundInverse() {
2166 UParseError parseError;
2167 UErrorCode status = U_ZERO_ERROR;
2168 Transliterator *t = Transliterator::createInstance
2169 ("Greek-Latin; Title()", UTRANS_REVERSE,parseError, status);
2170 if (t == 0) {
2171 dataerrln("FAIL: createInstance - %s", u_errorName(status));
2172 return;
2173 }
2174 UnicodeString exp("(Title);Latin-Greek");
2175 if (t->getID() == exp) {
2176 logln("Ok: inverse of \"Greek-Latin; Title()\" is \"" +
2177 t->getID());
2178 } else {
2179 errln("FAIL: inverse of \"Greek-Latin; Title()\" is \"" +
2180 t->getID() + "\", expected \"" + exp + "\"");
2181 }
2182 delete t;
2183 }
2184
2185 /**
2186 * Test NFD chaining with RBT
2187 */
TestNFDChainRBT()2188 void TransliteratorTest::TestNFDChainRBT() {
2189 UParseError pe;
2190 UErrorCode ec = U_ZERO_ERROR;
2191 Transliterator* t = Transliterator::createFromRules(
2192 "TEST", "::NFD; aa > Q; a > q;",
2193 UTRANS_FORWARD, pe, ec);
2194 if (t == nullptr || U_FAILURE(ec)) {
2195 dataerrln("FAIL: Transliterator::createFromRules failed with %s", u_errorName(ec));
2196 return;
2197 }
2198 expect(*t, "aa", "Q");
2199 delete t;
2200
2201 // TEMPORARY TESTS -- BEING DEBUGGED
2202 //=- UnicodeString s, s2;
2203 //=- t = Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, pe, ec);
2204 //=- s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2205 //=- s2 = CharsToUnicodeString("\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D");
2206 //=- expect(*t, s, s2);
2207 //=- delete t;
2208 //=-
2209 //=- t = Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2210 //=- expect(*t, s2, s);
2211 //=- delete t;
2212 //=-
2213 //=- t = Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2214 //=- s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2215 //=- expect(*t, s, s);
2216 //=- delete t;
2217
2218 // const char* source[] = {
2219 // /*
2220 // "\\u015Br\\u012Bmad",
2221 // "bhagavadg\\u012Bt\\u0101",
2222 // "adhy\\u0101ya",
2223 // "arjuna",
2224 // "vi\\u1E63\\u0101da",
2225 // "y\\u014Dga",
2226 // "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2227 // "uv\\u0101cr\\u0325",
2228 // */
2229 // "rmk\\u1E63\\u0113t",
2230 // //"dharmak\\u1E63\\u0113tr\\u0113",
2231 // /*
2232 // "kuruk\\u1E63\\u0113tr\\u0113",
2233 // "samav\\u0113t\\u0101",
2234 // "yuyutsava-\\u1E25",
2235 // "m\\u0101mak\\u0101-\\u1E25",
2236 // // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2237 // "kimakurvata",
2238 // "san\\u0304java",
2239 // */
2240 //
2241 // 0
2242 // };
2243 // const char* expected[] = {
2244 // /*
2245 // "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2246 // "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2247 // "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2248 // "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2249 // "\\u0935\\u093f\\u0937\\u093e\\u0926",
2250 // "\\u092f\\u094b\\u0917",
2251 // "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2252 // "\\u0909\\u0935\\u093E\\u091A\\u0943",
2253 // */
2254 // "\\u0927",
2255 // //"\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2256 // /*
2257 // "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2258 // "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2259 // "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2260 // "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2261 // // "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2262 // "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2263 // "\\u0938\\u0902\\u091c\\u0935",
2264 // */
2265 // 0
2266 // };
2267 // UErrorCode status = U_ZERO_ERROR;
2268 // UParseError parseError;
2269 // UnicodeString message;
2270 // Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2271 // Transliterator* devToLatinToDev=Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2272 // if(U_FAILURE(status)){
2273 // errln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2274 // errln("PreContext: " + prettify(parseError.preContext) + "PostContext: " + prettify( parseError.postContext) );
2275 // delete latinToDevToLatin;
2276 // delete devToLatinToDev;
2277 // return;
2278 // }
2279 // UnicodeString gotResult;
2280 // for(int i= 0; source[i] != 0; i++){
2281 // gotResult = source[i];
2282 // expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2283 // expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2284 // }
2285 // delete latinToDevToLatin;
2286 // delete devToLatinToDev;
2287 }
2288
2289 /**
2290 * Inverse of "Null" should be "Null". (J21)
2291 */
TestNullInverse()2292 void TransliteratorTest::TestNullInverse() {
2293 UParseError pe;
2294 UErrorCode ec = U_ZERO_ERROR;
2295 Transliterator *t = Transliterator::createInstance("Null", UTRANS_FORWARD, pe, ec);
2296 if (t == 0 || U_FAILURE(ec)) {
2297 errln("FAIL: createInstance");
2298 return;
2299 }
2300 Transliterator *u = t->createInverse(ec);
2301 if (u == 0 || U_FAILURE(ec)) {
2302 errln("FAIL: createInverse");
2303 delete t;
2304 return;
2305 }
2306 if (u->getID() != "Null") {
2307 errln("FAIL: Inverse of Null should be Null");
2308 }
2309 delete t;
2310 delete u;
2311 }
2312
2313 /**
2314 * Check ID of inverse of alias. (J22)
2315 */
TestAliasInverseID()2316 void TransliteratorTest::TestAliasInverseID() {
2317 UnicodeString ID("Latin-Hangul", ""); // This should be any alias ID with an inverse
2318 UParseError pe;
2319 UErrorCode ec = U_ZERO_ERROR;
2320 Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2321 if (t == 0 || U_FAILURE(ec)) {
2322 dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2323 return;
2324 }
2325 Transliterator *u = t->createInverse(ec);
2326 if (u == 0 || U_FAILURE(ec)) {
2327 errln("FAIL: createInverse");
2328 delete t;
2329 return;
2330 }
2331 UnicodeString exp = "Hangul-Latin";
2332 UnicodeString got = u->getID();
2333 if (got != exp) {
2334 errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2335 ", expected " + exp);
2336 }
2337 delete t;
2338 delete u;
2339 }
2340
2341 /**
2342 * Test IDs of inverses of compound transliterators. (J20)
2343 */
TestCompoundInverseID()2344 void TransliteratorTest::TestCompoundInverseID() {
2345 UnicodeString ID = "Latin-Jamo;NFC(NFD)";
2346 UParseError pe;
2347 UErrorCode ec = U_ZERO_ERROR;
2348 Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2349 if (t == 0 || U_FAILURE(ec)) {
2350 dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2351 return;
2352 }
2353 Transliterator *u = t->createInverse(ec);
2354 if (u == 0 || U_FAILURE(ec)) {
2355 errln("FAIL: createInverse");
2356 delete t;
2357 return;
2358 }
2359 UnicodeString exp = "NFD(NFC);Jamo-Latin";
2360 UnicodeString got = u->getID();
2361 if (got != exp) {
2362 errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2363 ", expected " + exp);
2364 }
2365 delete t;
2366 delete u;
2367 }
2368
2369 /**
2370 * Test undefined variable.
2371
2372 */
TestUndefinedVariable()2373 void TransliteratorTest::TestUndefinedVariable() {
2374 UnicodeString rule = "$initial } a <> \\u1161;";
2375 UParseError pe;
2376 UErrorCode ec = U_ZERO_ERROR;
2377 Transliterator *t = Transliterator::createFromRules("<ID>", rule, UTRANS_FORWARD, pe, ec);
2378 delete t;
2379 if (U_FAILURE(ec)) {
2380 logln((UnicodeString)"OK: Got exception for " + rule + ", as expected: " +
2381 u_errorName(ec));
2382 return;
2383 }
2384 errln((UnicodeString)"Fail: bogus rule " + rule + " compiled with error " +
2385 u_errorName(ec));
2386 }
2387
2388 /**
2389 * Test empty context.
2390 */
TestEmptyContext()2391 void TransliteratorTest::TestEmptyContext() {
2392 expect(" { a } > b;", "xay a ", "xby b ");
2393 }
2394
2395 /**
2396 * Test compound filter ID syntax
2397 */
TestCompoundFilterID()2398 void TransliteratorTest::TestCompoundFilterID() {
2399 static const char* DATA[] = {
2400 // Col. 1 = ID or rule set (latter must start with #)
2401
2402 // = columns > 1 are null if expect col. 1 to be illegal =
2403
2404 // Col. 2 = direction, "F..." or "R..."
2405 // Col. 3 = source string
2406 // Col. 4 = exp result
2407
2408 "[abc]; [abc]", nullptr, nullptr, nullptr, // multiple filters
2409 "Latin-Greek; [abc];", nullptr, nullptr, nullptr, // misplaced filter
2410 "[b]; Latin-Greek; Upper; ([xyz])", "F", "abc", "a\\u0392c",
2411 "[b]; (Lower); Latin-Greek; Upper(); ([\\u0392])", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2412 "#\n::[b]; ::Latin-Greek; ::Upper; ::([xyz]);", "F", "abc", "a\\u0392c",
2413 "#\n::[b]; ::(Lower); ::Latin-Greek; ::Upper(); ::([\\u0392]);", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2414 nullptr,
2415 };
2416
2417 for (int32_t i=0; DATA[i]; i+=4) {
2418 UnicodeString id = CharsToUnicodeString(DATA[i]);
2419 UTransDirection direction = (DATA[i+1] != nullptr && DATA[i+1][0] == 'R') ?
2420 UTRANS_REVERSE : UTRANS_FORWARD;
2421 UnicodeString source;
2422 UnicodeString exp;
2423 if (DATA[i+2] != nullptr) {
2424 source = CharsToUnicodeString(DATA[i+2]);
2425 exp = CharsToUnicodeString(DATA[i+3]);
2426 }
2427 UBool expOk = (DATA[i+1] != nullptr);
2428 LocalPointer<Transliterator> t;
2429 UParseError pe;
2430 UErrorCode ec = U_ZERO_ERROR;
2431 if (id.charAt(0) == 0x23/*#*/) {
2432 t.adoptInstead(Transliterator::createFromRules("ID", id, direction, pe, ec));
2433 } else {
2434 t.adoptInstead(Transliterator::createInstance(id, direction, pe, ec));
2435 }
2436 UBool ok = (t.isValid() && U_SUCCESS(ec));
2437 UnicodeString transID;
2438 if (t.isValid()) {
2439 transID = t->getID();
2440 }
2441 else {
2442 transID = UnicodeString("nullptr", "");
2443 }
2444 if (ok == expOk) {
2445 logln((UnicodeString)"Ok: " + id + " => " + transID + ", " +
2446 u_errorName(ec));
2447 if (source.length() != 0) {
2448 expect(*t, source, exp);
2449 }
2450 } else {
2451 dataerrln((UnicodeString)"FAIL: " + id + " => " + transID + ", " +
2452 u_errorName(ec));
2453 }
2454 }
2455 }
2456
2457 /**
2458 * Test new property set syntax
2459 */
TestPropertySet()2460 void TransliteratorTest::TestPropertySet() {
2461 expect(UNICODE_STRING_SIMPLE("a>A; \\p{Lu}>x; \\p{ANY}>y;"), "abcDEF", "Ayyxxx");
2462 expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
2463 "[ a stitch ]\n[ in time ]\r[ saves 9]");
2464 }
2465
2466 /**
2467 * Test various failure points of the new 2.0 engine.
2468 */
TestNewEngine()2469 void TransliteratorTest::TestNewEngine() {
2470 UParseError pe;
2471 UErrorCode ec = U_ZERO_ERROR;
2472 Transliterator *t = Transliterator::createInstance("Latin-Hiragana", UTRANS_FORWARD, pe, ec);
2473 if (t == 0 || U_FAILURE(ec)) {
2474 dataerrln("FAIL: createInstance Latin-Hiragana - %s", u_errorName(ec));
2475 return;
2476 }
2477 // Katakana should be untouched
2478 expect(*t, CharsToUnicodeString("a\\u3042\\u30A2"),
2479 CharsToUnicodeString("\\u3042\\u3042\\u30A2"));
2480
2481 delete t;
2482
2483 #if 1
2484 // This test will only work if Transliterator.ROLLBACK is
2485 // true. Otherwise, this test will fail, revealing a
2486 // limitation of global filters in incremental mode.
2487 Transliterator *a =
2488 Transliterator::createFromRules("a_to_A", "a > A;", UTRANS_FORWARD, pe, ec);
2489 Transliterator *A =
2490 Transliterator::createFromRules("A_to_b", "A > b;", UTRANS_FORWARD, pe, ec);
2491 if (U_FAILURE(ec)) {
2492 delete a;
2493 delete A;
2494 return;
2495 }
2496
2497 Transliterator* array[3];
2498 array[0] = a;
2499 array[1] = Transliterator::createInstance("NFD", UTRANS_FORWARD, pe, ec);
2500 array[2] = A;
2501 if (U_FAILURE(ec)) {
2502 errln("FAIL: createInstance NFD");
2503 delete a;
2504 delete A;
2505 delete array[1];
2506 return;
2507 }
2508
2509 t = new CompoundTransliterator(array, 3, new UnicodeSet("[:Ll:]", ec));
2510 if (U_FAILURE(ec)) {
2511 errln("FAIL: UnicodeSet constructor");
2512 delete a;
2513 delete A;
2514 delete array[1];
2515 delete t;
2516 return;
2517 }
2518
2519 expect(*t, "aAaA", "bAbA");
2520
2521 assertTrue("countElements", t->countElements() == 3);
2522 assertEquals("getElement(0)", t->getElement(0, ec).getID(), "a_to_A");
2523 assertEquals("getElement(1)", t->getElement(1, ec).getID(), "NFD");
2524 assertEquals("getElement(2)", t->getElement(2, ec).getID(), "A_to_b");
2525 assertSuccess("getElement", ec);
2526
2527 delete a;
2528 delete A;
2529 delete array[1];
2530 delete t;
2531 #endif
2532
2533 expect("$smooth = x; $macron = q; [:^L:] { ([aeiouyAEIOUY] $macron?) } [^aeiouyAEIOUY$smooth$macron] > | $1 $smooth ;",
2534 "a",
2535 "ax");
2536
2537 UnicodeString gr = CharsToUnicodeString(
2538 "$ddot = \\u0308 ;"
2539 "$lcgvowel = [\\u03b1\\u03b5\\u03b7\\u03b9\\u03bf\\u03c5\\u03c9] ;"
2540 "$rough = \\u0314 ;"
2541 "($lcgvowel+ $ddot?) $rough > h | $1 ;"
2542 "\\u03b1 <> a ;"
2543 "$rough <> h ;");
2544
2545 expect(gr, CharsToUnicodeString("\\u03B1\\u0314"), "ha");
2546 }
2547
2548 /**
2549 * Test quantified segment behavior. We want:
2550 * ([abc])+ > x $1 x; applied to "cba" produces "xax"
2551 */
TestQuantifiedSegment()2552 void TransliteratorTest::TestQuantifiedSegment() {
2553 // The normal case
2554 expect("([abc]+) > x $1 x;", "cba", "xcbax");
2555
2556 // The tricky case; the quantifier is around the segment
2557 expect("([abc])+ > x $1 x;", "cba", "xax");
2558
2559 // Tricky case in reverse direction
2560 expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
2561
2562 // Check post-context segment
2563 expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
2564
2565 // Test toRule/toPattern for non-quantified segment.
2566 // Careful with spacing here.
2567 UnicodeString r("([a-c]){q} > x $1 x;");
2568 UParseError pe;
2569 UErrorCode ec = U_ZERO_ERROR;
2570 Transliterator* t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2571 if (U_FAILURE(ec)) {
2572 errln("FAIL: createFromRules");
2573 delete t;
2574 return;
2575 }
2576 UnicodeString rr;
2577 t->toRules(rr, true);
2578 if (r != rr) {
2579 errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2580 } else {
2581 logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2582 }
2583 delete t;
2584
2585 // Test toRule/toPattern for quantified segment.
2586 // Careful with spacing here.
2587 r = "([a-c])+{q} > x $1 x;";
2588 t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2589 if (U_FAILURE(ec)) {
2590 errln("FAIL: createFromRules");
2591 delete t;
2592 return;
2593 }
2594 t->toRules(rr, true);
2595 if (r != rr) {
2596 errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2597 } else {
2598 logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2599 }
2600 delete t;
2601 }
2602
2603 //======================================================================
2604 // Ram's tests
2605 //======================================================================
TestDevanagariLatinRT()2606 void TransliteratorTest::TestDevanagariLatinRT(){
2607 const int MAX_LEN= 52;
2608 const char* const source[MAX_LEN] = {
2609 "bh\\u0101rata",
2610 "kra",
2611 "k\\u1E63a",
2612 "khra",
2613 "gra",
2614 "\\u1E45ra",
2615 "cra",
2616 "chra",
2617 "j\\u00F1a",
2618 "jhra",
2619 "\\u00F1ra",
2620 "\\u1E6Dya",
2621 "\\u1E6Dhra",
2622 "\\u1E0Dya",
2623 //"r\\u0323ya", // \u095c is not valid in Devanagari
2624 "\\u1E0Dhya",
2625 "\\u1E5Bhra",
2626 "\\u1E47ra",
2627 "tta",
2628 "thra",
2629 "dda",
2630 "dhra",
2631 "nna",
2632 "pra",
2633 "phra",
2634 "bra",
2635 "bhra",
2636 "mra",
2637 "\\u1E49ra",
2638 //"l\\u0331ra",
2639 "yra",
2640 "\\u1E8Fra",
2641 //"l-",
2642 "vra",
2643 "\\u015Bra",
2644 "\\u1E63ra",
2645 "sra",
2646 "hma",
2647 "\\u1E6D\\u1E6Da",
2648 "\\u1E6D\\u1E6Dha",
2649 "\\u1E6Dh\\u1E6Dha",
2650 "\\u1E0D\\u1E0Da",
2651 "\\u1E0D\\u1E0Dha",
2652 "\\u1E6Dya",
2653 "\\u1E6Dhya",
2654 "\\u1E0Dya",
2655 "\\u1E0Dhya",
2656 // Not roundtrippable --
2657 // \\u0939\\u094d\\u094d\\u092E - hma
2658 // \\u0939\\u094d\\u092E - hma
2659 // CharsToUnicodeString("hma"),
2660 "hya",
2661 "\\u015Br\\u0325",
2662 "\\u015Bca",
2663 "\\u0115",
2664 "san\\u0304j\\u012Bb s\\u0113nagupta",
2665 "\\u0101nand vaddir\\u0101ju",
2666 "\\u0101",
2667 "a"
2668 };
2669 const char* const expected[MAX_LEN] = {
2670 "\\u092D\\u093E\\u0930\\u0924", /* bha\\u0304rata */
2671 "\\u0915\\u094D\\u0930", /* kra */
2672 "\\u0915\\u094D\\u0937", /* ks\\u0323a */
2673 "\\u0916\\u094D\\u0930", /* khra */
2674 "\\u0917\\u094D\\u0930", /* gra */
2675 "\\u0919\\u094D\\u0930", /* n\\u0307ra */
2676 "\\u091A\\u094D\\u0930", /* cra */
2677 "\\u091B\\u094D\\u0930", /* chra */
2678 "\\u091C\\u094D\\u091E", /* jn\\u0303a */
2679 "\\u091D\\u094D\\u0930", /* jhra */
2680 "\\u091E\\u094D\\u0930", /* n\\u0303ra */
2681 "\\u091F\\u094D\\u092F", /* t\\u0323ya */
2682 "\\u0920\\u094D\\u0930", /* t\\u0323hra */
2683 "\\u0921\\u094D\\u092F", /* d\\u0323ya */
2684 //"\\u095C\\u094D\\u092F", /* r\\u0323ya */ // \u095c is not valid in Devanagari
2685 "\\u0922\\u094D\\u092F", /* d\\u0323hya */
2686 "\\u0922\\u093C\\u094D\\u0930", /* r\\u0323hra */
2687 "\\u0923\\u094D\\u0930", /* n\\u0323ra */
2688 "\\u0924\\u094D\\u0924", /* tta */
2689 "\\u0925\\u094D\\u0930", /* thra */
2690 "\\u0926\\u094D\\u0926", /* dda */
2691 "\\u0927\\u094D\\u0930", /* dhra */
2692 "\\u0928\\u094D\\u0928", /* nna */
2693 "\\u092A\\u094D\\u0930", /* pra */
2694 "\\u092B\\u094D\\u0930", /* phra */
2695 "\\u092C\\u094D\\u0930", /* bra */
2696 "\\u092D\\u094D\\u0930", /* bhra */
2697 "\\u092E\\u094D\\u0930", /* mra */
2698 "\\u0929\\u094D\\u0930", /* n\\u0331ra */
2699 //"\\u0934\\u094D\\u0930", /* l\\u0331ra */
2700 "\\u092F\\u094D\\u0930", /* yra */
2701 "\\u092F\\u093C\\u094D\\u0930", /* y\\u0307ra */
2702 //"l-",
2703 "\\u0935\\u094D\\u0930", /* vra */
2704 "\\u0936\\u094D\\u0930", /* s\\u0301ra */
2705 "\\u0937\\u094D\\u0930", /* s\\u0323ra */
2706 "\\u0938\\u094D\\u0930", /* sra */
2707 "\\u0939\\u094d\\u092E", /* hma */
2708 "\\u091F\\u094D\\u091F", /* t\\u0323t\\u0323a */
2709 "\\u091F\\u094D\\u0920", /* t\\u0323t\\u0323ha */
2710 "\\u0920\\u094D\\u0920", /* t\\u0323ht\\u0323ha*/
2711 "\\u0921\\u094D\\u0921", /* d\\u0323d\\u0323a */
2712 "\\u0921\\u094D\\u0922", /* d\\u0323d\\u0323ha */
2713 "\\u091F\\u094D\\u092F", /* t\\u0323ya */
2714 "\\u0920\\u094D\\u092F", /* t\\u0323hya */
2715 "\\u0921\\u094D\\u092F", /* d\\u0323ya */
2716 "\\u0922\\u094D\\u092F", /* d\\u0323hya */
2717 // "hma", /* hma */
2718 "\\u0939\\u094D\\u092F", /* hya */
2719 "\\u0936\\u0943", /* s\\u0301r\\u0325a */
2720 "\\u0936\\u094D\\u091A", /* s\\u0301ca */
2721 "\\u090d", /* e\\u0306 */
2722 "\\u0938\\u0902\\u091C\\u0940\\u092C\\u094D \\u0938\\u0947\\u0928\\u0917\\u0941\\u092A\\u094D\\u0924",
2723 "\\u0906\\u0928\\u0902\\u0926\\u094D \\u0935\\u0926\\u094D\\u0926\\u093F\\u0930\\u093E\\u091C\\u0941",
2724 "\\u0906",
2725 "\\u0905",
2726 };
2727 UErrorCode status = U_ZERO_ERROR;
2728 UParseError parseError;
2729 UnicodeString message;
2730 Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2731 Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2732 if(U_FAILURE(status)){
2733 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2734 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2735 return;
2736 }
2737 UnicodeString gotResult;
2738 for(int i= 0; i<MAX_LEN; i++){
2739 gotResult = source[i];
2740 expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2741 expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2742 }
2743 delete latinToDev;
2744 delete devToLatin;
2745 }
2746
TestTeluguLatinRT()2747 void TransliteratorTest::TestTeluguLatinRT(){
2748 const int MAX_LEN=10;
2749 const char* const source[MAX_LEN] = {
2750 "raghur\\u0101m vi\\u015Bvan\\u0101dha", /* Raghuram Viswanadha */
2751 "\\u0101nand vaddir\\u0101ju", /* Anand Vaddiraju */
2752 "r\\u0101j\\u012Bv ka\\u015Barab\\u0101da", /* Rajeev Kasarabada */
2753 "san\\u0304j\\u012Bv ka\\u015Barab\\u0101da", /* sanjeev kasarabada */
2754 "san\\u0304j\\u012Bb sen'gupta", /* sanjib sengupata */
2755 "amar\\u0113ndra hanum\\u0101nula", /* Amarendra hanumanula */
2756 "ravi kum\\u0101r vi\\u015Bvan\\u0101dha", /* Ravi Kumar Viswanadha */
2757 "\\u0101ditya kandr\\u0113gula", /* Aditya Kandregula */
2758 "\\u015Br\\u012Bdhar ka\\u1E47\\u1E6Dama\\u015Be\\u1E6D\\u1E6Di",/* Shridhar Kantamsetty */
2759 "m\\u0101dhav de\\u015Be\\u1E6D\\u1E6Di" /* Madhav Desetty */
2760 };
2761
2762 const char* const expected[MAX_LEN] = {
2763 "\\u0c30\\u0c18\\u0c41\\u0c30\\u0c3e\\u0c2e\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2764 "\\u0c06\\u0c28\\u0c02\\u0c26\\u0c4d \\u0C35\\u0C26\\u0C4D\\u0C26\\u0C3F\\u0C30\\u0C3E\\u0C1C\\u0C41",
2765 "\\u0c30\\u0c3e\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2766 "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2767 "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c2c\\u0c4d \\u0c38\\u0c46\\u0c28\\u0c4d\\u0c17\\u0c41\\u0c2a\\u0c4d\\u0c24",
2768 "\\u0c05\\u0c2e\\u0c30\\u0c47\\u0c02\\u0c26\\u0c4d\\u0c30 \\u0c39\\u0c28\\u0c41\\u0c2e\\u0c3e\\u0c28\\u0c41\\u0c32",
2769 "\\u0c30\\u0c35\\u0c3f \\u0c15\\u0c41\\u0c2e\\u0c3e\\u0c30\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2770 "\\u0c06\\u0c26\\u0c3f\\u0c24\\u0c4d\\u0c2f \\u0C15\\u0C02\\u0C26\\u0C4D\\u0C30\\u0C47\\u0C17\\u0C41\\u0c32",
2771 "\\u0c36\\u0c4d\\u0c30\\u0c40\\u0C27\\u0C30\\u0C4D \\u0c15\\u0c02\\u0c1f\\u0c2e\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2772 "\\u0c2e\\u0c3e\\u0c27\\u0c35\\u0c4d \\u0c26\\u0c46\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2773 };
2774
2775 UErrorCode status = U_ZERO_ERROR;
2776 UParseError parseError;
2777 UnicodeString message;
2778 Transliterator* latinToDev=Transliterator::createInstance("Latin-Telugu", UTRANS_FORWARD, parseError, status);
2779 Transliterator* devToLatin=Transliterator::createInstance("Telugu-Latin", UTRANS_FORWARD, parseError, status);
2780 if(U_FAILURE(status)){
2781 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2782 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2783 return;
2784 }
2785 UnicodeString gotResult;
2786 for(int i= 0; i<MAX_LEN; i++){
2787 gotResult = source[i];
2788 expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2789 expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2790 }
2791 delete latinToDev;
2792 delete devToLatin;
2793 }
2794
TestSanskritLatinRT()2795 void TransliteratorTest::TestSanskritLatinRT(){
2796 const int MAX_LEN =16;
2797 const char* const source[MAX_LEN] = {
2798 "rmk\\u1E63\\u0113t",
2799 "\\u015Br\\u012Bmad",
2800 "bhagavadg\\u012Bt\\u0101",
2801 "adhy\\u0101ya",
2802 "arjuna",
2803 "vi\\u1E63\\u0101da",
2804 "y\\u014Dga",
2805 "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2806 "uv\\u0101cr\\u0325",
2807 "dharmak\\u1E63\\u0113tr\\u0113",
2808 "kuruk\\u1E63\\u0113tr\\u0113",
2809 "samav\\u0113t\\u0101",
2810 "yuyutsava\\u1E25",
2811 "m\\u0101mak\\u0101\\u1E25",
2812 // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2813 "kimakurvata",
2814 "san\\u0304java",
2815 };
2816 const char* const expected[MAX_LEN] = {
2817 "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2818 "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2819 "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2820 "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2821 "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2822 "\\u0935\\u093f\\u0937\\u093e\\u0926",
2823 "\\u092f\\u094b\\u0917",
2824 "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2825 "\\u0909\\u0935\\u093E\\u091A\\u0943",
2826 "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2827 "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2828 "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2829 "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2830 "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2831 //"\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2832 "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2833 "\\u0938\\u0902\\u091c\\u0935",
2834 };
2835 UErrorCode status = U_ZERO_ERROR;
2836 UParseError parseError;
2837 UnicodeString message;
2838 Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2839 Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2840 if(U_FAILURE(status)){
2841 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2842 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2843 return;
2844 }
2845 UnicodeString gotResult;
2846 for(int i= 0; i<MAX_LEN; i++){
2847 gotResult = source[i];
2848 expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2849 expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2850 }
2851 delete latinToDev;
2852 delete devToLatin;
2853 }
2854
2855
TestCompoundLatinRT()2856 void TransliteratorTest::TestCompoundLatinRT(){
2857 const char* const source[] = {
2858 "rmk\\u1E63\\u0113t",
2859 "\\u015Br\\u012Bmad",
2860 "bhagavadg\\u012Bt\\u0101",
2861 "adhy\\u0101ya",
2862 "arjuna",
2863 "vi\\u1E63\\u0101da",
2864 "y\\u014Dga",
2865 "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2866 "uv\\u0101cr\\u0325",
2867 "dharmak\\u1E63\\u0113tr\\u0113",
2868 "kuruk\\u1E63\\u0113tr\\u0113",
2869 "samav\\u0113t\\u0101",
2870 "yuyutsava\\u1E25",
2871 "m\\u0101mak\\u0101\\u1E25",
2872 // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2873 "kimakurvata",
2874 "san\\u0304java"
2875 };
2876 const int MAX_LEN = UPRV_LENGTHOF(source);
2877 const char* const expected[MAX_LEN] = {
2878 "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2879 "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2880 "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2881 "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2882 "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2883 "\\u0935\\u093f\\u0937\\u093e\\u0926",
2884 "\\u092f\\u094b\\u0917",
2885 "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2886 "\\u0909\\u0935\\u093E\\u091A\\u0943",
2887 "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2888 "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2889 "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2890 "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2891 "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2892 // "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2893 "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2894 "\\u0938\\u0902\\u091c\\u0935"
2895 };
2896 if(MAX_LEN != UPRV_LENGTHOF(expected)) {
2897 errln("error in TestCompoundLatinRT: source[] and expected[] have different lengths!");
2898 return;
2899 }
2900
2901 UErrorCode status = U_ZERO_ERROR;
2902 UParseError parseError;
2903 UnicodeString message;
2904 Transliterator* devToLatinToDev =Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2905 Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2906 Transliterator* devToTelToDev =Transliterator::createInstance("Devanagari-Telugu;Telugu-Devanagari", UTRANS_FORWARD, parseError, status);
2907 Transliterator* latinToTelToLatin=Transliterator::createInstance("Latin-Telugu;Telugu-Latin", UTRANS_FORWARD, parseError, status);
2908
2909 if(U_FAILURE(status)){
2910 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2911 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2912 return;
2913 }
2914 UnicodeString gotResult;
2915 for(int i= 0; i<MAX_LEN; i++){
2916 gotResult = source[i];
2917 expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2918 expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2919 expect(*latinToTelToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2920
2921 }
2922 delete(latinToDevToLatin);
2923 delete(devToLatinToDev);
2924 delete(devToTelToDev);
2925 delete(latinToTelToLatin);
2926 }
2927
2928 /**
2929 * Test Gurmukhi-Devanagari Tippi and Bindi
2930 */
TestGurmukhiDevanagari()2931 void TransliteratorTest::TestGurmukhiDevanagari(){
2932 // the rule says:
2933 // (\u0902) (when preceded by vowel) ---> (\u0A02)
2934 // (\u0902) (when preceded by consonant) ---> (\u0A70)
2935 UErrorCode status = U_ZERO_ERROR;
2936 UnicodeSet vowel(UnicodeString("[\\u0905-\\u090A \\u090F\\u0910\\u0913\\u0914 \\u093e-\\u0942\\u0947\\u0948\\u094B\\u094C\\u094D]", -1, US_INV).unescape(), status);
2937 UnicodeSet non_vowel(UnicodeString("[\\u0915-\\u0928\\u092A-\\u0930]", -1, US_INV).unescape(), status);
2938 UParseError parseError;
2939
2940 UnicodeSetIterator vIter(vowel);
2941 UnicodeSetIterator nvIter(non_vowel);
2942 Transliterator* trans = Transliterator::createInstance("Devanagari-Gurmukhi",UTRANS_FORWARD, parseError, status);
2943 if(U_FAILURE(status)) {
2944 dataerrln("Error creating transliterator %s", u_errorName(status));
2945 delete trans;
2946 return;
2947 }
2948 UnicodeString src (" \\u0902", -1, US_INV);
2949 UnicodeString expected(" \\u0A02", -1, US_INV);
2950 src = src.unescape();
2951 expected= expected.unescape();
2952
2953 while(vIter.next()){
2954 src.setCharAt(0,(char16_t) vIter.getCodepoint());
2955 expected.setCharAt(0,(char16_t) (vIter.getCodepoint()+0x0100));
2956 expect(*trans,src,expected);
2957 }
2958
2959 expected.setCharAt(1,0x0A70);
2960 while(nvIter.next()){
2961 //src.setCharAt(0,(char) nvIter.codepoint);
2962 src.setCharAt(0,(char16_t)nvIter.getCodepoint());
2963 expected.setCharAt(0,(char16_t) (nvIter.getCodepoint()+0x0100));
2964 expect(*trans,src,expected);
2965 }
2966 delete trans;
2967 }
2968 /**
2969 * Test instantiation from a locale.
2970 */
TestLocaleInstantiation()2971 void TransliteratorTest::TestLocaleInstantiation() {
2972 UParseError pe;
2973 UErrorCode ec = U_ZERO_ERROR;
2974 Transliterator *t = Transliterator::createInstance("ru_RU-Latin", UTRANS_FORWARD, pe, ec);
2975 if (U_FAILURE(ec)) {
2976 dataerrln("FAIL: createInstance(ru_RU-Latin) - %s", u_errorName(ec));
2977 delete t;
2978 return;
2979 }
2980 expect(*t, CharsToUnicodeString("\\u0430"), "a");
2981 delete t;
2982
2983 t = Transliterator::createInstance("en-el", UTRANS_FORWARD, pe, ec);
2984 if (U_FAILURE(ec)) {
2985 errln("FAIL: createInstance(en-el)");
2986 delete t;
2987 return;
2988 }
2989 expect(*t, "a", CharsToUnicodeString("\\u03B1"));
2990 delete t;
2991 }
2992
2993 /**
2994 * Test title case handling of accent (should ignore accents)
2995 */
TestTitleAccents()2996 void TransliteratorTest::TestTitleAccents() {
2997 UParseError pe;
2998 UErrorCode ec = U_ZERO_ERROR;
2999 Transliterator *t = Transliterator::createInstance("Title", UTRANS_FORWARD, pe, ec);
3000 if (U_FAILURE(ec)) {
3001 errln("FAIL: createInstance(Title)");
3002 delete t;
3003 return;
3004 }
3005 expect(*t, CharsToUnicodeString("a\\u0300b can't abe"), CharsToUnicodeString("A\\u0300b Can't Abe"));
3006 delete t;
3007 }
3008
3009 /**
3010 * Basic test of a locale resource based rule.
3011 */
TestLocaleResource()3012 void TransliteratorTest::TestLocaleResource() {
3013 const char* DATA[] = {
3014 // id from to
3015 //"Latin-Greek/UNGEGN", "b", "\\u03bc\\u03c0",
3016 "Latin-el", "b", "\\u03bc\\u03c0",
3017 "Latin-Greek", "b", "\\u03B2",
3018 "Greek-Latin/UNGEGN", "\\u03B2", "v",
3019 "el-Latin", "\\u03B2", "v",
3020 "Greek-Latin", "\\u03B2", "b",
3021 };
3022 const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3023 for (int32_t i=0; i<DATA_length; i+=3) {
3024 UParseError pe;
3025 UErrorCode ec = U_ZERO_ERROR;
3026 Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, pe, ec);
3027 if (U_FAILURE(ec)) {
3028 dataerrln((UnicodeString)"FAIL: createInstance(" + DATA[i] + ") - " + u_errorName(ec));
3029 delete t;
3030 continue;
3031 }
3032 expect(*t, CharsToUnicodeString(DATA[i+1]),
3033 CharsToUnicodeString(DATA[i+2]));
3034 delete t;
3035 }
3036 }
3037
3038 /**
3039 * Make sure parse errors reference the right line.
3040 */
TestParseError()3041 void TransliteratorTest::TestParseError() {
3042 static const char* rule =
3043 "a > b;\n"
3044 "# more stuff\n"
3045 "d << b;";
3046 UErrorCode ec = U_ZERO_ERROR;
3047 UParseError pe;
3048 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3049 delete t;
3050 if (U_FAILURE(ec)) {
3051 UnicodeString err(pe.preContext);
3052 err.append((char16_t)124/*|*/).append(pe.postContext);
3053 if (err.indexOf("d << b") >= 0) {
3054 logln("Ok: " + err);
3055 } else {
3056 errln("FAIL: " + err);
3057 }
3058 }
3059 else {
3060 errln("FAIL: no syntax error");
3061 }
3062 static const char* maskingRule =
3063 "a>x;\n"
3064 "# more stuff\n"
3065 "ab>y;";
3066 ec = U_ZERO_ERROR;
3067 delete Transliterator::createFromRules("ID", maskingRule, UTRANS_FORWARD, pe, ec);
3068 if (ec != U_RULE_MASK_ERROR) {
3069 errln("FAIL: returned %s instead of U_RULE_MASK_ERROR", u_errorName(ec));
3070 }
3071 else if (UnicodeString("a > x;") != UnicodeString(pe.preContext)) {
3072 errln("FAIL: did not get expected precontext");
3073 }
3074 else if (UnicodeString("ab > y;") != UnicodeString(pe.postContext)) {
3075 errln("FAIL: did not get expected postcontext");
3076 }
3077 }
3078
3079 /**
3080 * Make sure sets on output are disallowed.
3081 */
TestOutputSet()3082 void TransliteratorTest::TestOutputSet() {
3083 UnicodeString rule = "$set = [a-cm-n]; b > $set;";
3084 UErrorCode ec = U_ZERO_ERROR;
3085 UParseError pe;
3086 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3087 delete t;
3088 if (U_FAILURE(ec)) {
3089 UnicodeString err(pe.preContext);
3090 err.append((char16_t)124/*|*/).append(pe.postContext);
3091 logln("Ok: " + err);
3092 return;
3093 }
3094 errln("FAIL: No syntax error");
3095 }
3096
3097 /**
3098 * Test the use variable range pragma, making sure that use of
3099 * variable range characters is detected and flagged as an error.
3100 */
TestVariableRange()3101 void TransliteratorTest::TestVariableRange() {
3102 UnicodeString rule = "use variable range 0x70 0x72; a > A; b > B; q > Q;";
3103 UErrorCode ec = U_ZERO_ERROR;
3104 UParseError pe;
3105 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3106 delete t;
3107 if (U_FAILURE(ec)) {
3108 UnicodeString err(pe.preContext);
3109 err.append((char16_t)124/*|*/).append(pe.postContext);
3110 logln("Ok: " + err);
3111 return;
3112 }
3113 errln("FAIL: No syntax error");
3114 }
3115
3116 /**
3117 * Test invalid post context error handling
3118 */
TestInvalidPostContext()3119 void TransliteratorTest::TestInvalidPostContext() {
3120 UnicodeString rule = "a}b{c>d;";
3121 UErrorCode ec = U_ZERO_ERROR;
3122 UParseError pe;
3123 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3124 delete t;
3125 if (U_FAILURE(ec)) {
3126 UnicodeString err(pe.preContext);
3127 err.append((char16_t)124/*|*/).append(pe.postContext);
3128 if (err.indexOf("a}b{c") >= 0) {
3129 logln("Ok: " + err);
3130 } else {
3131 errln("FAIL: " + err);
3132 }
3133 return;
3134 }
3135 errln("FAIL: No syntax error");
3136 }
3137
3138 /**
3139 * Test ID form variants
3140 */
TestIDForms()3141 void TransliteratorTest::TestIDForms() {
3142 const char* DATA[] = {
3143 "NFC", nullptr, "NFD",
3144 "nfd", nullptr, "NFC", // make sure case is ignored
3145 "Any-NFKD", nullptr, "Any-NFKC",
3146 "Null", nullptr, "Null",
3147 "-nfkc", "nfkc", "NFKD",
3148 "-nfkc/", "nfkc", "NFKD",
3149 "Latin-Greek/UNGEGN", nullptr, "Greek-Latin/UNGEGN",
3150 "Greek/UNGEGN-Latin", "Greek-Latin/UNGEGN", "Latin-Greek/UNGEGN",
3151 "Bengali-Devanagari/", "Bengali-Devanagari", "Devanagari-Bengali",
3152 "Source-", nullptr, nullptr,
3153 "Source/Variant-", nullptr, nullptr,
3154 "Source-/Variant", nullptr, nullptr,
3155 "/Variant", nullptr, nullptr,
3156 "/Variant-", nullptr, nullptr,
3157 "-/Variant", nullptr, nullptr,
3158 "-/", nullptr, nullptr,
3159 "-", nullptr, nullptr,
3160 "/", nullptr, nullptr,
3161 };
3162 const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3163
3164 for (int32_t i=0; i<DATA_length; i+=3) {
3165 const char* ID = DATA[i];
3166 const char* expID = DATA[i+1];
3167 const char* expInvID = DATA[i+2];
3168 UBool expValid = (expInvID != nullptr);
3169 if (expID == nullptr) {
3170 expID = ID;
3171 }
3172 UParseError pe;
3173 UErrorCode ec = U_ZERO_ERROR;
3174 Transliterator *t =
3175 Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
3176 if (U_FAILURE(ec)) {
3177 if (!expValid) {
3178 logln((UnicodeString)"Ok: getInstance(" + ID +") => " + u_errorName(ec));
3179 } else {
3180 dataerrln((UnicodeString)"FAIL: Couldn't create " + ID + " - " + u_errorName(ec));
3181 }
3182 delete t;
3183 continue;
3184 }
3185 Transliterator *u = t->createInverse(ec);
3186 if (U_FAILURE(ec)) {
3187 errln((UnicodeString)"FAIL: Couldn't create inverse of " + ID);
3188 delete t;
3189 delete u;
3190 continue;
3191 }
3192 if (t->getID() == expID &&
3193 u->getID() == expInvID) {
3194 logln((UnicodeString)"Ok: " + ID + ".getInverse() => " + expInvID);
3195 } else {
3196 errln((UnicodeString)"FAIL: getInstance(" + ID + ") => " +
3197 t->getID() + " x getInverse() => " + u->getID() +
3198 ", expected " + expInvID);
3199 }
3200 delete t;
3201 delete u;
3202 }
3203 }
3204
// NUL-terminated UTF-16 strings used by checkRules() to strip whitespace
// from rule text: U+0020, U+000A, U+000D, and the empty replacement.
static const char16_t SPACE[]   = u" ";
static const char16_t NEWLINE[] = u"\n";
static const char16_t RETURN[]  = u"\r";
static const char16_t EMPTY[]   = u"";
3209
checkRules(const UnicodeString & label,Transliterator & t2,const UnicodeString & testRulesForward)3210 void TransliteratorTest::checkRules(const UnicodeString& label, Transliterator& t2,
3211 const UnicodeString& testRulesForward) {
3212 UnicodeString rules2; t2.toRules(rules2, true);
3213 //rules2 = TestUtility.replaceAll(rules2, new UnicodeSet("[' '\n\r]"), "");
3214 rules2.findAndReplace(SPACE, EMPTY);
3215 rules2.findAndReplace(NEWLINE, EMPTY);
3216 rules2.findAndReplace(RETURN, EMPTY);
3217
3218 UnicodeString testRules(testRulesForward); testRules.findAndReplace(SPACE, EMPTY);
3219
3220 if (rules2 != testRules) {
3221 errln(label);
3222 logln((UnicodeString)"GENERATED RULES: " + rules2);
3223 logln((UnicodeString)"SHOULD BE: " + testRulesForward);
3224 }
3225 }
3226
3227 /**
3228 * Mark's toRules test.
3229 */
TestToRulesMark()3230 void TransliteratorTest::TestToRulesMark() {
3231 const char* testRules =
3232 "::[[:Latin:][:Mark:]];"
3233 "::NFKD (NFC);"
3234 "::Lower (Lower);"
3235 "a <> \\u03B1;" // alpha
3236 "::NFKC (NFD);"
3237 "::Upper (Lower);"
3238 "::Lower ();"
3239 "::([[:Greek:][:Mark:]]);"
3240 ;
3241 const char* testRulesForward =
3242 "::[[:Latin:][:Mark:]];"
3243 "::NFKD(NFC);"
3244 "::Lower(Lower);"
3245 "a > \\u03B1;"
3246 "::NFKC(NFD);"
3247 "::Upper (Lower);"
3248 "::Lower ();"
3249 ;
3250 const char* testRulesBackward =
3251 "::[[:Greek:][:Mark:]];"
3252 "::Lower (Upper);"
3253 "::NFD(NFKC);"
3254 "\\u03B1 > a;"
3255 "::Lower(Lower);"
3256 "::NFC(NFKD);"
3257 ;
3258 UnicodeString source = CharsToUnicodeString("\\u00E1"); // a-acute
3259 UnicodeString target = CharsToUnicodeString("\\u03AC"); // alpha-acute
3260
3261 UParseError pe;
3262 UErrorCode ec = U_ZERO_ERROR;
3263 LocalPointer<Transliterator> t2(
3264 Transliterator::createFromRules("source-target", UnicodeString(testRules, -1, US_INV), UTRANS_FORWARD, pe, ec));
3265 LocalPointer<Transliterator> t3(
3266 Transliterator::createFromRules("target-source", UnicodeString(testRules, -1, US_INV), UTRANS_REVERSE, pe, ec));
3267
3268 if (U_FAILURE(ec)) {
3269 dataerrln((UnicodeString)"FAIL: createFromRules => " + u_errorName(ec));
3270 return;
3271 }
3272
3273 expect(*t2, source, target);
3274 expect(*t3, target, source);
3275
3276 checkRules("Failed toRules FORWARD", *t2, UnicodeString(testRulesForward, -1, US_INV));
3277 checkRules("Failed toRules BACKWARD", *t3, UnicodeString(testRulesBackward, -1, US_INV));
3278 }
3279
3280 /**
3281 * Test Escape and Unescape transliterators.
3282 */
TestEscape()3283 void TransliteratorTest::TestEscape() {
3284 UParseError pe;
3285 UErrorCode ec;
3286 Transliterator *t;
3287
3288 ec = U_ZERO_ERROR;
3289 t = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, pe, ec);
3290 if (U_FAILURE(ec)) {
3291 errln((UnicodeString)"FAIL: createInstance");
3292 } else {
3293 expect(*t,
3294 UNICODE_STRING_SIMPLE("\\x{40}\\U000000312Q"),
3295 "@12Q");
3296 }
3297 delete t;
3298
3299 ec = U_ZERO_ERROR;
3300 t = Transliterator::createInstance("Any-Hex/C", UTRANS_FORWARD, pe, ec);
3301 if (U_FAILURE(ec)) {
3302 errln((UnicodeString)"FAIL: createInstance");
3303 } else {
3304 expect(*t,
3305 CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3306 UNICODE_STRING_SIMPLE("\\u0041\\U0010BEEF\\uFEED"));
3307 }
3308 delete t;
3309
3310 ec = U_ZERO_ERROR;
3311 t = Transliterator::createInstance("Any-Hex/Java", UTRANS_FORWARD, pe, ec);
3312 if (U_FAILURE(ec)) {
3313 errln((UnicodeString)"FAIL: createInstance");
3314 } else {
3315 expect(*t,
3316 CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3317 UNICODE_STRING_SIMPLE("\\u0041\\uDBEF\\uDEEF\\uFEED"));
3318 }
3319 delete t;
3320
3321 ec = U_ZERO_ERROR;
3322 t = Transliterator::createInstance("Any-Hex/Perl", UTRANS_FORWARD, pe, ec);
3323 if (U_FAILURE(ec)) {
3324 errln((UnicodeString)"FAIL: createInstance");
3325 } else {
3326 expect(*t,
3327 CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3328 UNICODE_STRING_SIMPLE("\\x{41}\\x{10BEEF}\\x{FEED}"));
3329 }
3330 delete t;
3331 }
3332
3333
TestAnchorMasking()3334 void TransliteratorTest::TestAnchorMasking(){
3335 UnicodeString rule ("^a > Q; a > q;");
3336 UErrorCode status= U_ZERO_ERROR;
3337 UParseError parseError;
3338
3339 Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
3340 if(U_FAILURE(status)){
3341 errln(UnicodeString("FAIL: ") + "ID" +
3342 ".createFromRules() => bad rules" +
3343 /*", parse error " + parseError.code +*/
3344 ", line " + parseError.line +
3345 ", offset " + parseError.offset +
3346 ", context " + prettify(parseError.preContext, true) +
3347 ", rules: " + prettify(rule, true));
3348 }
3349 delete t;
3350 }
3351
3352 /**
3353 * Make sure display names of variants look reasonable.
3354 */
TestDisplayName()3355 void TransliteratorTest::TestDisplayName() {
3356 #if UCONFIG_NO_FORMATTING
3357 logln("Skipping, UCONFIG_NO_FORMATTING is set\n");
3358 return;
3359 #else
3360 static const char* DATA[] = {
3361 // ID, forward name, reverse name
3362 // Update the text as necessary -- the important thing is
3363 // not the text itself, but how various cases are handled.
3364
3365 // Basic test
3366 "Any-Hex", "Any to Hex Escape", "Hex Escape to Any",
3367
3368 // Variants
3369 "Any-Hex/Perl", "Any to Hex Escape/Perl", "Hex Escape to Any/Perl",
3370
3371 // Target-only IDs
3372 "NFC", "Any to NFC", "Any to NFD",
3373 };
3374
3375 int32_t DATA_length = UPRV_LENGTHOF(DATA);
3376
3377 Locale US("en", "US");
3378
3379 for (int32_t i=0; i<DATA_length; i+=3) {
3380 UnicodeString name;
3381 Transliterator::getDisplayName(DATA[i], US, name);
3382 if (name != DATA[i+1]) {
3383 dataerrln((UnicodeString)"FAIL: " + DATA[i] + ".getDisplayName() => " +
3384 name + ", expected " + DATA[i+1]);
3385 } else {
3386 logln((UnicodeString)"Ok: " + DATA[i] + ".getDisplayName() => " + name);
3387 }
3388 UErrorCode ec = U_ZERO_ERROR;
3389 UParseError pe;
3390 Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_REVERSE, pe, ec);
3391 if (U_FAILURE(ec)) {
3392 delete t;
3393 dataerrln("FAIL: createInstance failed - %s", u_errorName(ec));
3394 continue;
3395 }
3396 name = Transliterator::getDisplayName(t->getID(), US, name);
3397 if (name != DATA[i+2]) {
3398 dataerrln((UnicodeString)"FAIL: " + t->getID() + ".getDisplayName() => " +
3399 name + ", expected " + DATA[i+2]);
3400 } else {
3401 logln((UnicodeString)"Ok: " + t->getID() + ".getDisplayName() => " + name);
3402 }
3403 delete t;
3404 }
3405 #endif
3406 }
3407
TestSpecialCases()3408 void TransliteratorTest::TestSpecialCases() {
3409 const UnicodeString registerRules[] = {
3410 "Any-Dev1", "x > X; y > Y;",
3411 "Any-Dev2", "XY > Z",
3412 "Greek-Latin/FAKE",
3413 CharsToUnicodeString
3414 ("[^[:L:][:M:]] { \\u03bc\\u03c0 > b ; \\u03bc\\u03c0 } [^[:L:][:M:]] > b ; [^[:L:][:M:]] { [\\u039c\\u03bc][\\u03a0\\u03c0] > B ; [\\u039c\\u03bc][\\u03a0\\u03c0] } [^[:L:][:M:]] > B ;"),
3415 "" // END MARKER
3416 };
3417
3418 const UnicodeString testCases[] = {
3419 // NORMALIZATION
3420 // should add more test cases
3421 "NFD" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3422 "NFC" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3423 "NFKD", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3424 "NFKC", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3425
3426 // mp -> b BUG
3427 "Greek-Latin/UNGEGN", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3428 "Greek-Latin/FAKE", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3429
3430 // check for devanagari bug
3431 "nfd;Dev1;Dev2;nfc", "xy", "Z",
3432
3433 // ff, i, dotless-i, I, dotted-I, LJLjlj deseret deeDEE
3434 "Title", CharsToUnicodeString("ab'cD ffi\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3435 CharsToUnicodeString("Ab'cd Ffi\\u0131ii\\u0307 \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3436
3437 //TODO: enable this test once Titlecase works right
3438 /*
3439 "Title", CharsToUnicodeString("\\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3440 CharsToUnicodeString("Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3441 */
3442 "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3443 CharsToUnicodeString("AB'CD FFIII\\u0130 \\u01C7\\u01C7\\u01C7 ") + DESERET_DEE + DESERET_DEE,
3444 "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3445 CharsToUnicodeString("ab'cd \\uFB00i\\u0131ii\\u0307 \\u01C9\\u01C9\\u01C9 ") + DESERET_dee + DESERET_dee,
3446
3447 "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3448 "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3449
3450 // FORMS OF S
3451 "Greek-Latin/UNGEGN", CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3452 CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3453 "Latin-Greek/UNGEGN", CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3454 CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3") ,
3455 "Greek-Latin", CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3456 CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3457 "Latin-Greek", CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3458 CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3459 // Tatiana bug
3460 // Upper: TAT\\u02B9\\u00C2NA
3461 // Lower: tat\\u02B9\\u00E2na
3462 // Title: Tat\\u02B9\\u00E2na
3463 "Upper", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3464 CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3465 "Lower", CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3466 CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3467 "Title", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3468 CharsToUnicodeString("Tat\\u02B9\\u00E2na"),
3469
3470 "" // END MARKER
3471 };
3472
3473 UParseError pos;
3474 int32_t i;
3475 for (i = 0; registerRules[i].length()!=0; i+=2) {
3476 UErrorCode status = U_ZERO_ERROR;
3477
3478 Transliterator *t = Transliterator::createFromRules(registerRules[0+i],
3479 registerRules[i+1], UTRANS_FORWARD, pos, status);
3480 if (U_FAILURE(status)) {
3481 dataerrln("Fails: Unable to create the transliterator from rules. - %s", u_errorName(status));
3482 } else {
3483 Transliterator::registerInstance(t);
3484 }
3485 }
3486 for (i = 0; testCases[i].length()!=0; i+=3) {
3487 UErrorCode ec = U_ZERO_ERROR;
3488 UParseError pe;
3489 const UnicodeString& name = testCases[i];
3490 Transliterator *t = Transliterator::createInstance(name, UTRANS_FORWARD, pe, ec);
3491 if (U_FAILURE(ec)) {
3492 dataerrln((UnicodeString)"FAIL: Couldn't create " + name + " - " + u_errorName(ec));
3493 delete t;
3494 continue;
3495 }
3496 const UnicodeString& id = t->getID();
3497 const UnicodeString& source = testCases[i+1];
3498 UnicodeString target;
3499
3500 // Automatic generation of targets, to make it simpler to add test cases (and more fail-safe)
3501
3502 if (testCases[i+2].length() > 0) {
3503 target = testCases[i+2];
3504 } else if (0==id.caseCompare("NFD", U_FOLD_CASE_DEFAULT)) {
3505 Normalizer::normalize(source, UNORM_NFD, 0, target, ec);
3506 } else if (0==id.caseCompare("NFC", U_FOLD_CASE_DEFAULT)) {
3507 Normalizer::normalize(source, UNORM_NFC, 0, target, ec);
3508 } else if (0==id.caseCompare("NFKD", U_FOLD_CASE_DEFAULT)) {
3509 Normalizer::normalize(source, UNORM_NFKD, 0, target, ec);
3510 } else if (0==id.caseCompare("NFKC", U_FOLD_CASE_DEFAULT)) {
3511 Normalizer::normalize(source, UNORM_NFKC, 0, target, ec);
3512 } else if (0==id.caseCompare("Lower", U_FOLD_CASE_DEFAULT)) {
3513 target = source;
3514 target.toLower(Locale::getUS());
3515 } else if (0==id.caseCompare("Upper", U_FOLD_CASE_DEFAULT)) {
3516 target = source;
3517 target.toUpper(Locale::getUS());
3518 }
3519 if (U_FAILURE(ec)) {
3520 errln((UnicodeString)"FAIL: Internal error normalizing " + source);
3521 continue;
3522 }
3523
3524 expect(*t, source, target);
3525 delete t;
3526 }
3527 for (i = 0; registerRules[i].length()!=0; i+=2) {
3528 Transliterator::unregister(registerRules[i]);
3529 }
3530 }
3531
/**
 * Formats a code point as an escape sequence: backslash, 'u' and four
 * lowercase hex digits when ch <= 0xFFFF, otherwise backslash, 'U' and eight
 * lowercase hex digits.
 *
 * @param ch     code point to format
 * @param buffer destination buffer
 * @param n      size of buffer, passed through to snprintf()
 * @return buffer
 */
char* Char32ToEscapedChars(UChar32 ch, char* buffer, size_t n) {
    const bool needsLongForm = ch > 0xFFFF;
    if (needsLongForm) {
        snprintf(buffer, n, "\\U%08x", (int)ch);
        return buffer;
    }
    snprintf(buffer, n, "\\u%04x", (int)ch);
    return buffer;
}
3540
TestSurrogateCasing()3541 void TransliteratorTest::TestSurrogateCasing() {
3542 // check that casing handles surrogates
3543 // titlecase is currently defective
3544 char buffer[20];
3545 char16_t buffer2[20];
3546 UChar32 dee;
3547 U16_GET(DESERET_dee,0, 0, DESERET_dee.length(), dee);
3548 UnicodeString DEE(u_totitle(dee));
3549 if (DEE != DESERET_DEE) {
3550 err("Fails titlecase of surrogates");
3551 err(Char32ToEscapedChars(dee, buffer, sizeof(buffer)));
3552 err(", ");
3553 errln(Char32ToEscapedChars(DEE.char32At(0), buffer, sizeof(buffer)));
3554 }
3555
3556 UnicodeString deeDEETest=DESERET_dee + DESERET_DEE;
3557 UnicodeString deedeeTest = DESERET_dee + DESERET_dee;
3558 UnicodeString DEEDEETest = DESERET_DEE + DESERET_DEE;
3559 UErrorCode status= U_ZERO_ERROR;
3560
3561 u_strToUpper(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), nullptr, &status);
3562 if (U_FAILURE(status) || (UnicodeString(buffer2)!= DEEDEETest)) {
3563 errln("Fails: Can't uppercase surrogates.");
3564 }
3565
3566 status= U_ZERO_ERROR;
3567 u_strToLower(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), nullptr, &status);
3568 if (U_FAILURE(status) || (UnicodeString(buffer2)!= deedeeTest)) {
3569 errln("Fails: Can't lowercase surrogates.");
3570 }
3571 }
3572
_trans(Transliterator & t,const UnicodeString & src,UnicodeString & result)3573 static void _trans(Transliterator& t, const UnicodeString& src,
3574 UnicodeString& result) {
3575 result = src;
3576 t.transliterate(result);
3577 }
3578
_trans(const UnicodeString & id,const UnicodeString & src,UnicodeString & result,UErrorCode ec)3579 static void _trans(const UnicodeString& id, const UnicodeString& src,
3580 UnicodeString& result, UErrorCode ec) {
3581 UParseError pe;
3582 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
3583 if (U_SUCCESS(ec)) {
3584 _trans(*t, src, result);
3585 }
3586 delete t;
3587 }
3588
_findMatch(const UnicodeString & source,const UnicodeString * pairs)3589 static UnicodeString _findMatch(const UnicodeString& source,
3590 const UnicodeString* pairs) {
3591 UnicodeString empty;
3592 for (int32_t i=0; pairs[i].length() > 0; i+=2) {
3593 if (0==source.caseCompare(pairs[i], U_FOLD_CASE_DEFAULT)) {
3594 return pairs[i+1];
3595 }
3596 }
3597 return empty;
3598 }
3599
3600 // Check to see that incremental gets at least part way through a reasonable string.
3601
TestIncrementalProgress()3602 void TransliteratorTest::TestIncrementalProgress() {
3603 UErrorCode ec = U_ZERO_ERROR;
3604 UnicodeString latinTest = "The Quick Brown Fox.";
3605 UnicodeString devaTest;
3606 _trans("Latin-Devanagari", latinTest, devaTest, ec);
3607 UnicodeString kataTest;
3608 _trans("Latin-Katakana", latinTest, kataTest, ec);
3609 if (U_FAILURE(ec)) {
3610 errln("FAIL: Internal error");
3611 return;
3612 }
3613 const UnicodeString tests[] = {
3614 "Any", latinTest,
3615 "Latin", latinTest,
3616 "Halfwidth", latinTest,
3617 "Devanagari", devaTest,
3618 "Katakana", kataTest,
3619 "" // END MARKER
3620 };
3621
3622 UnicodeString test("The Quick Brown Fox Jumped Over The Lazy Dog.");
3623 int32_t i = 0, j=0, k=0;
3624 int32_t sources = Transliterator::countAvailableSources();
3625 for (i = 0; i < sources; i++) {
3626 UnicodeString source;
3627 Transliterator::getAvailableSource(i, source);
3628 UnicodeString test = _findMatch(source, tests);
3629 if (test.length() == 0) {
3630 logln((UnicodeString)"Skipping " + source + "-X");
3631 continue;
3632 }
3633 int32_t targets = Transliterator::countAvailableTargets(source);
3634 for (j = 0; j < targets; j++) {
3635 UnicodeString target;
3636 Transliterator::getAvailableTarget(j, source, target);
3637 int32_t variants = Transliterator::countAvailableVariants(source, target);
3638 for (k =0; k< variants; k++) {
3639 UnicodeString variant;
3640 UParseError err;
3641 UErrorCode status = U_ZERO_ERROR;
3642
3643 Transliterator::getAvailableVariant(k, source, target, variant);
3644 UnicodeString id = source + "-" + target + "/" + variant;
3645
3646 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, err, status);
3647 if (U_FAILURE(status)) {
3648 dataerrln((UnicodeString)"FAIL: Could not create " + id + ", status " + u_errorName(status));
3649 delete t;
3650 continue;
3651 }
3652 status = U_ZERO_ERROR;
3653 CheckIncrementalAux(t, test);
3654
3655 UnicodeString rev;
3656 _trans(*t, test, rev);
3657 Transliterator *inv = t->createInverse(status);
3658 if (U_FAILURE(status)) {
3659 // The following are forward-only, it is OK that creating an inverse will not work:
3660 // 1. Devanagari-Arabic
3661 // 2. Any-*/BGN
3662 // 2a. Any-*/BGN_1981
3663 // 3. Any-*/MNS
3664 // 3a. Any-*/Geminate[d]
3665 //
3666 // 4. If UCONFIG_NO_BREAK_ITERATION is on, Latin-Thai is also not expected to work.
3667 //
3668 // The following are direction="both" transforms with variants, inverting the Any-Xxxx/Variant for
3669 // any of these does not work; see ICU-21911 (not sure whether this is intentional or an ICU bug).
3670 // Unfortunately we do not easily have the info at this point as to whether the original transform
3671 // had direction="both" specified.
3672 // 5. Any-*/UNGEGN
3673 // 6. Any-Ethiopic/*
3674 // 7. Any-Braille/*
3675 // 8. Any-*/Gurage_2013
3676 // 9. Any-*/Gutgarts
3677 // 10. Any-*/Tekie_Alibekit
3678 // 11. Any-*/Xaleget
3679 //
3680 if ( id.compare((UnicodeString)"Devanagari-Arabic/") != 0
3681 && !(id.startsWith((UnicodeString)"Any-") &&
3682 (id.endsWith((UnicodeString)"/BGN") || id.endsWith((UnicodeString)"/BGN_1981") || id.endsWith((UnicodeString)"/MNS") ||
3683 id.endsWith((UnicodeString)"/Geminate") || id.endsWith((UnicodeString)"/Geminated"))
3684 )
3685 #if UCONFIG_NO_BREAK_ITERATION
3686 && id.compare((UnicodeString)"Latin-Thai/") != 0
3687 #endif
3688 && !(logKnownIssue("21911", "ICU4C cannot create inverse of Any-Xxxx/Variant transform created from both-direction transform") &&
3689 id.startsWith((UnicodeString)"Any-") &&
3690 (id.endsWith((UnicodeString)"/UNGEGN") || id.startsWith((UnicodeString)"Any-Ethiopic/") || id.startsWith((UnicodeString)"Any-Braille/") ||
3691 id.endsWith((UnicodeString)"/Gurage_2013") || id.endsWith((UnicodeString)"/Gutgarts") || id.endsWith((UnicodeString)"/Tekie_Alibekit") ||
3692 id.endsWith((UnicodeString)"/Xaleget"))
3693 )
3694 )
3695 {
3696 errln((UnicodeString)"FAIL: Could not create inverse of " + id + ", status " + u_errorName(status));
3697 }
3698 delete t;
3699 delete inv;
3700 continue;
3701 }
3702 CheckIncrementalAux(inv, rev);
3703 delete t;
3704 delete inv;
3705 }
3706 }
3707 }
3708 }
3709
CheckIncrementalAux(const Transliterator * t,const UnicodeString & input)3710 void TransliteratorTest::CheckIncrementalAux(const Transliterator* t,
3711 const UnicodeString& input) {
3712 UErrorCode ec = U_ZERO_ERROR;
3713 UTransPosition pos;
3714 UnicodeString test = input;
3715
3716 pos.contextStart = 0;
3717 pos.contextLimit = input.length();
3718 pos.start = 0;
3719 pos.limit = input.length();
3720
3721 t->transliterate(test, pos, ec);
3722 if (U_FAILURE(ec)) {
3723 errln((UnicodeString)"FAIL: transliterate() error " + u_errorName(ec));
3724 return;
3725 }
3726 UBool gotError = false;
3727 (void)gotError; // Suppress set but not used warning.
3728
3729 // we have a few special cases. Any-Remove (pos.start = 0, but also = limit) and U+XXXXX?X?
3730
3731 if (pos.start == 0 && pos.limit != 0 && t->getID() != "Hex-Any/Unicode") {
3732 errln((UnicodeString)"No Progress, " +
3733 t->getID() + ": " + formatInput(test, input, pos));
3734 gotError = true;
3735 } else {
3736 logln((UnicodeString)"PASS Progress, " +
3737 t->getID() + ": " + formatInput(test, input, pos));
3738 }
3739 t->finishTransliteration(test, pos);
3740 if (pos.start != pos.limit) {
3741 errln((UnicodeString)"Incomplete, " +
3742 t->getID() + ": " + formatInput(test, input, pos));
3743 gotError = true;
3744 }
3745 }
3746
TestFunction()3747 void TransliteratorTest::TestFunction() {
3748 // Careful with spacing and ';' here: Phrase this exactly
3749 // as toRules() is going to return it. If toRules() changes
3750 // with regard to spacing or ';', then adjust this string.
3751 UnicodeString rule =
3752 "([:Lu:]) > $1 '(' &Lower( $1 ) '=' &Hex( &Any-Lower( $1 ) ) ')';";
3753
3754 UParseError pe;
3755 UErrorCode ec = U_ZERO_ERROR;
3756 Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3757 if (t == nullptr) {
3758 dataerrln("FAIL: createFromRules failed - %s", u_errorName(ec));
3759 return;
3760 }
3761
3762 UnicodeString r;
3763 t->toRules(r, true);
3764 if (r == rule) {
3765 logln((UnicodeString)"OK: toRules() => " + r);
3766 } else {
3767 errln((UnicodeString)"FAIL: toRules() => " + r +
3768 ", expected " + rule);
3769 }
3770
3771 expect(*t, "The Quick Brown Fox",
3772 UNICODE_STRING_SIMPLE("T(t=\\u0074)he Q(q=\\u0071)uick B(b=\\u0062)rown F(f=\\u0066)ox"));
3773
3774 delete t;
3775 }
3776
TestInvalidBackRef()3777 void TransliteratorTest::TestInvalidBackRef() {
3778 UnicodeString rule = ". > $1;";
3779 UnicodeString rule2 =CharsToUnicodeString("(.) <> &hex/unicode($1) &name($1); . > $1; [{}] >\\u0020;");
3780 UParseError pe;
3781 UErrorCode ec = U_ZERO_ERROR;
3782 Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3783 Transliterator *t2 = Transliterator::createFromRules("Test2", rule2, UTRANS_FORWARD, pe, ec);
3784
3785 if (t != nullptr) {
3786 errln("FAIL: createFromRules should have returned nullptr");
3787 delete t;
3788 }
3789
3790 if (t2 != nullptr) {
3791 errln("FAIL: createFromRules should have returned nullptr");
3792 delete t2;
3793 }
3794
3795 if (U_SUCCESS(ec)) {
3796 errln("FAIL: Ok: . > $1; => no error");
3797 } else {
3798 logln((UnicodeString)"Ok: . > $1; => " + u_errorName(ec));
3799 }
3800 }
3801
TestMulticharStringSet()3802 void TransliteratorTest::TestMulticharStringSet() {
3803 // Basic testing
3804 const char* rule =
3805 " [{aa}] > x;"
3806 " a > y;"
3807 " [b{bc}] > z;"
3808 "[{gd}] { e > q;"
3809 " e } [{fg}] > r;" ;
3810
3811 UParseError pe;
3812 UErrorCode ec = U_ZERO_ERROR;
3813 Transliterator* t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3814 if (t == nullptr || U_FAILURE(ec)) {
3815 delete t;
3816 errln("FAIL: createFromRules failed");
3817 return;
3818 }
3819
3820 expect(*t, "a aa ab bc d gd de gde gdefg ddefg",
3821 "y x yz z d gd de gdq gdqfg ddrfg");
3822 delete t;
3823
3824 // Overlapped string test. Make sure that when multiple
3825 // strings can match that the longest one is matched.
3826 rule =
3827 " [a {ab} {abc}] > x;"
3828 " b > y;"
3829 " c > z;"
3830 " q [t {st} {rst}] { e > p;" ;
3831
3832 t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3833 if (t == nullptr || U_FAILURE(ec)) {
3834 delete t;
3835 errln("FAIL: createFromRules failed");
3836 return;
3837 }
3838
3839 expect(*t, "a ab abc qte qste qrste",
3840 "x x x qtp qstp qrstp");
3841 delete t;
3842 }
3843
3844 // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
3845 // BEGIN TestUserFunction support factory
3846
// Prototype transliterators for the TestUserFunction factory, indexed by
// registration slot. _TUFFactory() clones the entry selected by its token;
// _TUFReg() fills a slot and _TUFUnreg() deletes it.
Transliterator* _TUFF[4];
// Heap-allocated copies of the IDs registered for each slot, parallel to
// _TUFF; used by _TUFUnreg() to unregister the ID again.
UnicodeString* _TUFID[4];
3849
_TUFFactory(const UnicodeString &,Transliterator::Token context)3850 static Transliterator* U_EXPORT2 _TUFFactory(const UnicodeString& /*ID*/,
3851 Transliterator::Token context) {
3852 return _TUFF[context.integer]->clone();
3853 }
3854
_TUFReg(const UnicodeString & ID,Transliterator * t,int32_t n)3855 static void _TUFReg(const UnicodeString& ID, Transliterator* t, int32_t n) {
3856 _TUFF[n] = t;
3857 _TUFID[n] = new UnicodeString(ID);
3858 Transliterator::registerFactory(ID, _TUFFactory, Transliterator::integerToken(n));
3859 }
3860
_TUFUnreg(int32_t n)3861 static void _TUFUnreg(int32_t n) {
3862 if (_TUFF[n] != nullptr) {
3863 Transliterator::unregister(*_TUFID[n]);
3864 delete _TUFF[n];
3865 delete _TUFID[n];
3866 }
3867 }
3868
3869 // END TestUserFunction support factory
3870 // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3871
3872 /**
3873 * Test that user-registered transliterators can be used under function
3874 * syntax.
3875 */
TestUserFunction()3876 void TransliteratorTest::TestUserFunction() {
3877
3878 Transliterator* t;
3879 UParseError pe;
3880 UErrorCode ec = U_ZERO_ERROR;
3881
3882 // Setup our factory
3883 int32_t i;
3884 for (i=0; i<4; ++i) {
3885 _TUFF[i] = nullptr;
3886 }
3887
3888 // There's no need to register inverses if we don't use them
3889 t = Transliterator::createFromRules("gif",
3890 UNICODE_STRING_SIMPLE("'\\'u(..)(..) > '<img src=\"http://www.unicode.org/gifs/24/' $1 '/U' $1$2 '.gif\">';"),
3891 UTRANS_FORWARD, pe, ec);
3892 if (t == nullptr || U_FAILURE(ec)) {
3893 dataerrln((UnicodeString)"FAIL: createFromRules gif " + u_errorName(ec));
3894 return;
3895 }
3896 _TUFReg("Any-gif", t, 0);
3897
3898 t = Transliterator::createFromRules("RemoveCurly",
3899 UNICODE_STRING_SIMPLE("[\\{\\}] > ; '\\N' > ;"),
3900 UTRANS_FORWARD, pe, ec);
3901 if (t == nullptr || U_FAILURE(ec)) {
3902 errln((UnicodeString)"FAIL: createFromRules RemoveCurly " + u_errorName(ec));
3903 goto FAIL;
3904 }
3905 expect(*t, UNICODE_STRING_SIMPLE("\\N{name}"), "name");
3906 _TUFReg("Any-RemoveCurly", t, 1);
3907
3908 logln("Trying &hex");
3909 t = Transliterator::createFromRules("hex2",
3910 "(.) > &hex($1);",
3911 UTRANS_FORWARD, pe, ec);
3912 if (t == nullptr || U_FAILURE(ec)) {
3913 errln("FAIL: createFromRules");
3914 goto FAIL;
3915 }
3916 logln("Registering");
3917 _TUFReg("Any-hex2", t, 2);
3918 t = Transliterator::createInstance("Any-hex2", UTRANS_FORWARD, ec);
3919 if (t == nullptr || U_FAILURE(ec)) {
3920 errln((UnicodeString)"FAIL: createInstance Any-hex2 " + u_errorName(ec));
3921 goto FAIL;
3922 }
3923 expect(*t, "abc", UNICODE_STRING_SIMPLE("\\u0061\\u0062\\u0063"));
3924 delete t;
3925
3926 logln("Trying &gif");
3927 t = Transliterator::createFromRules("gif2",
3928 "(.) > &Gif(&Hex2($1));",
3929 UTRANS_FORWARD, pe, ec);
3930 if (t == nullptr || U_FAILURE(ec)) {
3931 errln((UnicodeString)"FAIL: createFromRules gif2 " + u_errorName(ec));
3932 goto FAIL;
3933 }
3934 logln("Registering");
3935 _TUFReg("Any-gif2", t, 3);
3936 t = Transliterator::createInstance("Any-gif2", UTRANS_FORWARD, ec);
3937 if (t == nullptr || U_FAILURE(ec)) {
3938 errln((UnicodeString)"FAIL: createInstance Any-gif2 " + u_errorName(ec));
3939 goto FAIL;
3940 }
3941 expect(*t, "ab", "<img src=\"http://www.unicode.org/gifs/24/00/U0061.gif\">"
3942 "<img src=\"http://www.unicode.org/gifs/24/00/U0062.gif\">");
3943 delete t;
3944
3945 // Test that filters are allowed after &
3946 t = Transliterator::createFromRules("test",
3947 "(.) > &Hex($1) ' ' &RemoveCurly(&Name($1)) ' ';",
3948 UTRANS_FORWARD, pe, ec);
3949 if (t == nullptr || U_FAILURE(ec)) {
3950 errln((UnicodeString)"FAIL: createFromRules test " + u_errorName(ec));
3951 goto FAIL;
3952 }
3953 expect(*t, "abc",
3954 UNICODE_STRING_SIMPLE("\\u0061 LATIN SMALL LETTER A \\u0062 LATIN SMALL LETTER B \\u0063 LATIN SMALL LETTER C "));
3955 delete t;
3956
3957 FAIL:
3958 for (i=0; i<4; ++i) {
3959 _TUFUnreg(i);
3960 }
3961 }
3962
3963 /**
3964 * Test the Any-X transliterators.
3965 */
TestAnyX()3966 void TransliteratorTest::TestAnyX() {
3967 UParseError parseError;
3968 UErrorCode status = U_ZERO_ERROR;
3969 Transliterator* anyLatin =
3970 Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
3971 if (anyLatin==0) {
3972 dataerrln("FAIL: createInstance returned nullptr - %s", u_errorName(status));
3973 delete anyLatin;
3974 return;
3975 }
3976
3977 expect(*anyLatin,
3978 CharsToUnicodeString("greek:\\u03B1\\u03B2\\u03BA\\u0391\\u0392\\u039A hiragana:\\u3042\\u3076\\u304F cyrillic:\\u0430\\u0431\\u0446"),
3979 CharsToUnicodeString("greek:abkABK hiragana:abuku cyrillic:abc"));
3980
3981 delete anyLatin;
3982 }
3983
3984 /**
3985 * Test Any-X transliterators with sample letters from all scripts.
3986 */
TestAny()3987 void TransliteratorTest::TestAny() {
3988 UErrorCode status = U_ZERO_ERROR;
3989 // Note: there is a lot of implicit construction of UnicodeStrings from (char *) in
3990 // function call parameters going on in this test.
3991 UnicodeSet alphabetic("[:alphabetic:]", status);
3992 if (U_FAILURE(status)) {
3993 dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
3994 return;
3995 }
3996 alphabetic.freeze();
3997
3998 UnicodeString testString;
3999 for (int32_t i = 0; i < USCRIPT_CODE_LIMIT; i++) {
4000 const char *scriptName = uscript_getShortName((UScriptCode)i);
4001 if (scriptName == nullptr) {
4002 errln("Failure: file %s, line %d: Script Code %d is invalid, ", __FILE__, __LINE__, i);
4003 return;
4004 }
4005
4006 UnicodeSet sample;
4007 sample.applyPropertyAlias("script", scriptName, status);
4008 if (U_FAILURE(status)) {
4009 errln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4010 return;
4011 }
4012 sample.retainAll(alphabetic);
4013 for (int32_t count=0; count<5; count++) {
4014 UChar32 c = sample.charAt(count);
4015 if (c == -1) {
4016 break;
4017 }
4018 testString.append(c);
4019 }
4020 }
4021
4022 UParseError parseError;
4023 Transliterator* anyLatin =
4024 Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4025 if (U_FAILURE(status)) {
4026 dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4027 return;
4028 }
4029
4030 logln(UnicodeString("Sample set for Any-Latin: ") + testString);
4031 anyLatin->transliterate(testString);
4032 logln(UnicodeString("Sample result for Any-Latin: ") + testString);
4033 delete anyLatin;
4034 }
4035
4036
4037 /**
4038 * Test the source and target set API. These are only implemented
4039 * for RBT and CompoundTransliterator at this time.
4040 */
TestSourceTargetSet()4041 void TransliteratorTest::TestSourceTargetSet() {
4042 UErrorCode ec = U_ZERO_ERROR;
4043
4044 // Rules
4045 const char* r =
4046 "a > b; "
4047 "r [x{lu}] > q;";
4048
4049 // Expected source
4050 UnicodeSet expSrc("[arx{lu}]", ec);
4051
4052 // Expected target
4053 UnicodeSet expTrg("[bq]", ec);
4054
4055 UParseError pe;
4056 Transliterator* t = Transliterator::createFromRules("test", r, UTRANS_FORWARD, pe, ec);
4057
4058 if (U_FAILURE(ec)) {
4059 delete t;
4060 errln("FAIL: Couldn't set up test");
4061 return;
4062 }
4063
4064 UnicodeSet src; t->getSourceSet(src);
4065 UnicodeSet trg; t->getTargetSet(trg);
4066
4067 if (src == expSrc && trg == expTrg) {
4068 UnicodeString a, b;
4069 logln((UnicodeString)"Ok: " +
4070 r + " => source = " + src.toPattern(a, true) +
4071 ", target = " + trg.toPattern(b, true));
4072 } else {
4073 UnicodeString a, b, c, d;
4074 errln((UnicodeString)"FAIL: " +
4075 r + " => source = " + src.toPattern(a, true) +
4076 ", expected " + expSrc.toPattern(b, true) +
4077 "; target = " + trg.toPattern(c, true) +
4078 ", expected " + expTrg.toPattern(d, true));
4079 }
4080
4081 delete t;
4082 }
4083
4084 /**
4085 * Test handling of Pattern_White_Space, for both RBT and UnicodeSet.
4086 */
TestPatternWhiteSpace()4087 void TransliteratorTest::TestPatternWhiteSpace() {
4088 // Rules
4089 const char* r = "a > \\u200E b;";
4090
4091 UErrorCode ec = U_ZERO_ERROR;
4092 UParseError pe;
4093 Transliterator* t = Transliterator::createFromRules("test", CharsToUnicodeString(r), UTRANS_FORWARD, pe, ec);
4094
4095 if (U_FAILURE(ec)) {
4096 errln("FAIL: Couldn't set up test");
4097 } else {
4098 expect(*t, "a", "b");
4099 }
4100 delete t;
4101
4102 // UnicodeSet
4103 ec = U_ZERO_ERROR;
4104 UnicodeSet set(CharsToUnicodeString("[a \\u200E]"), ec);
4105
4106 if (U_FAILURE(ec)) {
4107 errln("FAIL: Couldn't set up test");
4108 } else {
4109 if (set.contains(0x200E)) {
4110 errln("FAIL: U+200E not being ignored by UnicodeSet");
4111 }
4112 }
4113 }
4114 //======================================================================
4115 // this method is in TestUScript.java
4116 //======================================================================
TestAllCodepoints()4117 void TransliteratorTest::TestAllCodepoints(){
4118 UScriptCode code= USCRIPT_INVALID_CODE;
4119 char id[256]={'\0'};
4120 char abbr[256]={'\0'};
4121 char newId[256]={'\0'};
4122 char newAbbrId[256]={'\0'};
4123 char oldId[256]={'\0'};
4124 char oldAbbrId[256]={'\0'};
4125
4126 UErrorCode status =U_ZERO_ERROR;
4127 UParseError pe;
4128
4129 for(uint32_t i = 0; i<=0x10ffff; i++){
4130 code = uscript_getScript(i,&status);
4131 if(code == USCRIPT_INVALID_CODE){
4132 dataerrln("uscript_getScript for codepoint \\U%08X failed.", i);
4133 }
4134 const char* myId = uscript_getName(code);
4135 if(!myId) {
4136 dataerrln("Valid script code returned nullptr name. Check your data!");
4137 return;
4138 }
4139 uprv_strcpy(id,myId);
4140 uprv_strcpy(abbr,uscript_getShortName(code));
4141
4142 uprv_strcpy(newId,"[:");
4143 uprv_strcat(newId,id);
4144 uprv_strcat(newId,":];NFD");
4145
4146 uprv_strcpy(newAbbrId,"[:");
4147 uprv_strcat(newAbbrId,abbr);
4148 uprv_strcat(newAbbrId,":];NFD");
4149
4150 if(uprv_strcmp(newId,oldId)!=0){
4151 Transliterator* t = Transliterator::createInstance(newId,UTRANS_FORWARD,pe,status);
4152 if(t==nullptr || U_FAILURE(status)){
4153 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4154 }
4155 delete t;
4156 }
4157 if(uprv_strcmp(newAbbrId,oldAbbrId)!=0){
4158 Transliterator* t = Transliterator::createInstance(newAbbrId,UTRANS_FORWARD,pe,status);
4159 if(t==nullptr || U_FAILURE(status)){
4160 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4161 }
4162 delete t;
4163 }
4164 uprv_strcpy(oldId,newId);
4165 uprv_strcpy(oldAbbrId, newAbbrId);
4166
4167 }
4168
4169 }
4170
// Creates a forward transliterator from `id` and reports an error unless its
// dynamic class ID equals cls::getStaticClassID(). A creation failure is
// reported through dataerrln(). `id` must be usable as a "%s" argument and
// `cls` must be a class name (it is stringized for the message).
#define TEST_TRANSLIT_ID(id, cls) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode ec = U_ZERO_ERROR; \
    Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); \
    if (U_FAILURE(ec)) { \
        dataerrln("FAIL: Couldn't create %s - %s", id, u_errorName(ec)); \
    } else { \
        if (t->getDynamicClassID() != cls::getStaticClassID()) { \
            errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
        } \
        /* *t = *t; */ /*can't do this: coverage test for assignment op*/ \
    } \
    delete t; \
} UPRV_BLOCK_MACRO_END
4184
// Same check as TEST_TRANSLIT_ID, but the transliterator is compiled from
// `rule` with createFromRules(). `rule` must be a string literal because it
// is concatenated into the failure message at compile time.
#define TEST_TRANSLIT_RULE(rule, cls) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode ec = U_ZERO_ERROR; \
    UParseError pe; \
    Transliterator* t = Transliterator::createFromRules("_", rule, UTRANS_FORWARD, pe, ec); \
    if (U_FAILURE(ec)) { \
        errln("FAIL: Couldn't create " rule); \
    } else { \
        if (t->getDynamicClassID() != cls ::getStaticClassID()) { \
            errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
        } \
        /* *t = *t; */ /*can't do this: coverage test for assignment op*/ \
    } \
    delete t; \
} UPRV_BLOCK_MACRO_END
4199
/**
 * Checks the class-ID boilerplate of the transliterator classes: for each
 * ID (or rule) below, the created instance's getDynamicClassID() must equal
 * the getStaticClassID() of the class named next to it.
 */
void TransliteratorTest::TestBoilerplate() {
    TEST_TRANSLIT_ID("Any-Latin", AnyTransliterator);
    TEST_TRANSLIT_ID("Any-Hex", EscapeTransliterator);
    TEST_TRANSLIT_ID("Hex-Any", UnescapeTransliterator);
    TEST_TRANSLIT_ID("Lower", LowercaseTransliterator);
    TEST_TRANSLIT_ID("Upper", UppercaseTransliterator);
    TEST_TRANSLIT_ID("Title", TitlecaseTransliterator);
    TEST_TRANSLIT_ID("Null", NullTransliterator);
    TEST_TRANSLIT_ID("Remove", RemoveTransliterator);
    TEST_TRANSLIT_ID("Any-Name", UnicodeNameTransliterator);
    TEST_TRANSLIT_ID("Name-Any", NameUnicodeTransliterator);
    TEST_TRANSLIT_ID("NFD", NormalizationTransliterator);
    TEST_TRANSLIT_ID("Latin-Greek", CompoundTransliterator);
    // Rule-compiled transliterators go through createFromRules() instead.
    TEST_TRANSLIT_RULE("a>b;", RuleBasedTransliterator);
}
4215
TestAlternateSyntax()4216 void TransliteratorTest::TestAlternateSyntax() {
4217 // U+2206 == &
4218 // U+2190 == <
4219 // U+2192 == >
4220 // U+2194 == <>
4221 expect(CharsToUnicodeString("a \\u2192 x; b \\u2190 y; c \\u2194 z"),
4222 "abc",
4223 "xbz");
4224 expect(CharsToUnicodeString("([:^ASCII:]) \\u2192 \\u2206Name($1);"),
4225 CharsToUnicodeString("<=\\u2190; >=\\u2192; <>=\\u2194; &=\\u2206"),
4226 UNICODE_STRING_SIMPLE("<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}"));
4227 }
4228
// Rule sets shared by TestBeginEnd() and TestBeginEndToRules(). Entries are
// referenced by index from BEGIN_END_TEST_CASES, so the ::BEGIN/::END
// variants that are currently disabled are kept as empty-string placeholders
// to leave the remaining indexes unchanged.
static const char* BEGIN_END_RULES[] = {
    // [0]
    "abc > xy;"
    "aba > z;",

    // [1]
/*
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "::BEGIN;"
    "aba > z;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [2]
/*
    "abc > xy;"
    "::BEGIN;"
    "aba > z;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [3]
/*
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "aba > z;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [4]
    "abc > xy;"
    "::Null;"
    "aba > z;",

    // [5]
    "::Upper;"
    "ABC > xy;"
    "AB > x;"
    "C > z;"
    "::Upper;"
    "XYZ > p;"
    "XY > q;"
    "Z > r;"
    "::Upper;",

    // [6]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [7]
    "::Null;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [8]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;",

    // [9]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [10]
/*
    "::BEGIN;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::END;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [11]
/*
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::BEGIN;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [12]
/*
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::BEGIN;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::END;"
    "::BEGIN;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::END;"
    "::BEGIN;"
    "'a-a' > a\\%|a;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [13]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::Null;"
    "'a-a' > a\\%|a;",

    // [14]
/*
    "::[abc];"
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "::BEGIN;"
    "aba > yz;"
    "::END;"
    "::Upper;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [15]
    "::[abc];"
    "abc > xy;"
    "::Null;"
    "aba > yz;"
    "::Upper;",

    // [16]
/*
    "::[abc];"
    "::BEGIN;"
    "abc <> xy;"
    "::END;"
    "::BEGIN;"
    "aba <> yz;"
    "::END;"
    "::Upper(Lower);"
    "::([XYZ]);"
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [17]
    // The only reversible rule set; TestBeginEnd() and TestBeginEndToRules()
    // also instantiate it in the reverse direction.
    "::[abc];"
    "abc <> xy;"
    "::Null;"
    "aba <> yz;"
    "::Upper(Lower);"
    "::([XYZ]);"
};
4402
4403 /*
4404 (This entire test is commented out below and will need some heavy revision when we re-add
4405 the ::BEGIN/::END stuff)
4406 static const char* BOGUS_BEGIN_END_RULES[] = {
4407 // [7]
4408 "::BEGIN;"
4409 "abc > xy;"
4410 "::BEGIN;"
4411 "aba > z;"
4412 "::END;"
4413 "::END;",
4414
4415 // [8]
4416 "abc > xy;"
4417 " aba > z;"
4418 "::END;",
4419
4420 // [9]
4421 "::BEGIN;"
4422 "::Upper;"
4423 "::END;"
4424 };
4425 static const int32_t BOGUS_BEGIN_END_RULES_length = UPRV_LENGTHOF(BOGUS_BEGIN_END_RULES);
4426 */
4427
// Flat table of test cases for TestBeginEnd() and TestBeginEndToRules().
// Every case occupies three consecutive entries (rules, input, expected
// output), so consumers step through the array three entries at a time.
// Cases for the disabled ::BEGIN/::END rule sets are commented out.
static const char* BEGIN_END_TEST_CASES[] = {
    // rules input expected output
    BEGIN_END_RULES[0], "abc ababc aba", "xy zbc z",
//    BEGIN_END_RULES[1], "abc ababc aba", "xy abxy z",
//    BEGIN_END_RULES[2], "abc ababc aba", "xy abxy z",
//    BEGIN_END_RULES[3], "abc ababc aba", "xy abxy z",
    BEGIN_END_RULES[4], "abc ababc aba", "xy abxy z",
    BEGIN_END_RULES[5], "abccabaacababcbc", "PXAARXQBR",

    BEGIN_END_RULES[6], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[7], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[8], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[9], "e e - e---e- e", "e e e-e-e",
//    BEGIN_END_RULES[10], "e e - e---e- e", "e e e-e-e",
//    BEGIN_END_RULES[11], "e e - e---e- e", "e e e-e-e",
//    BEGIN_END_RULES[12], "e e - e---e- e", "e e e-e-e",
//    BEGIN_END_RULES[12], "a a a a", "a%a%a%a",
//    BEGIN_END_RULES[12], "a a-b c b a", "a%a-b cb-a",
    BEGIN_END_RULES[13], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[13], "a a a a", "a%a%a%a",
    BEGIN_END_RULES[13], "a a-b c b a", "a%a-b cb-a",

//    BEGIN_END_RULES[14], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[15], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
//    BEGIN_END_RULES[16], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[17], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ"
};
// Number of array entries (three per test case), not the number of cases.
static const int32_t BEGIN_END_TEST_CASES_length = UPRV_LENGTHOF(BEGIN_END_TEST_CASES);
4456
TestBeginEnd()4457 void TransliteratorTest::TestBeginEnd() {
4458 // run through the list of test cases above
4459 int32_t i = 0;
4460 for (i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4461 expect((UnicodeString)"Test case #" + (i / 3),
4462 UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4463 UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4464 UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4465 }
4466
4467 // instantiate the one reversible rule set in the reverse direction and make sure it does the right thing
4468 UParseError parseError;
4469 UErrorCode status = U_ZERO_ERROR;
4470 Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4471 UTRANS_REVERSE, parseError, status);
4472 if (reversed == 0 || U_FAILURE(status)) {
4473 reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4474 } else {
4475 expect(*reversed, UnicodeString("xy XY XYZ yz YZ"), UnicodeString("xy abc xaba yz aba"));
4476 }
4477 delete reversed;
4478
4479 // finally, run through the list of syntactically-ill-formed rule sets above and make sure
4480 // that all of them cause errors
4481 /*
4482 (commented out until we have the real ::BEGIN/::END stuff in place
4483 for (i = 0; i < BOGUS_BEGIN_END_RULES_length; i++) {
4484 UParseError parseError;
4485 UErrorCode status = U_ZERO_ERROR;
4486 Transliterator* t = Transliterator::createFromRules("foo", UnicodeString(BOGUS_BEGIN_END_RULES[i]),
4487 UTRANS_FORWARD, parseError, status);
4488 if (!U_FAILURE(status)) {
4489 delete t;
4490 errln((UnicodeString)"Should have gotten syntax error from " + BOGUS_BEGIN_END_RULES[i]);
4491 }
4492 }
4493 */
4494 }
4495
TestBeginEndToRules()4496 void TransliteratorTest::TestBeginEndToRules() {
4497 // run through the same list of test cases we used above, but this time, instead of just
4498 // instantiating a Transliterator from the rules and running the test against it, we instantiate
4499 // a Transliterator from the rules, do toRules() on it, instantiate a Transliterator from
4500 // the resulting set of rules, and make sure that the generated rule set is semantically equivalent
4501 // to (i.e., does the same thing as) the original rule set
4502 for (int32_t i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4503 UParseError parseError;
4504 UErrorCode status = U_ZERO_ERROR;
4505 Transliterator* t = Transliterator::createFromRules("--", UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4506 UTRANS_FORWARD, parseError, status);
4507 if (U_FAILURE(status)) {
4508 reportParseError(UnicodeString("FAIL: Couldn't create transliterator"), parseError, status);
4509 } else {
4510 UnicodeString rules;
4511 t->toRules(rules, true);
4512 Transliterator* t2 = Transliterator::createFromRules((UnicodeString)"Test case #" + (i / 3), rules,
4513 UTRANS_FORWARD, parseError, status);
4514 if (U_FAILURE(status)) {
4515 reportParseError(UnicodeString("FAIL: Couldn't create transliterator from generated rules"),
4516 parseError, status);
4517 delete t;
4518 } else {
4519 expect(*t2,
4520 UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4521 UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4522 delete t;
4523 delete t2;
4524 }
4525 }
4526 }
4527
4528 // do the same thing for the reversible test case
4529 UParseError parseError;
4530 UErrorCode status = U_ZERO_ERROR;
4531 Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4532 UTRANS_REVERSE, parseError, status);
4533 if (U_FAILURE(status)) {
4534 reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4535 } else {
4536 UnicodeString rules;
4537 reversed->toRules(rules, false);
4538 Transliterator* reversed2 = Transliterator::createFromRules("Reversed", rules, UTRANS_FORWARD,
4539 parseError, status);
4540 if (U_FAILURE(status)) {
4541 reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator from generated rules"),
4542 parseError, status);
4543 delete reversed;
4544 } else {
4545 expect(*reversed2,
4546 UnicodeString("xy XY XYZ yz YZ"),
4547 UnicodeString("xy abc xaba yz aba"));
4548 delete reversed;
4549 delete reversed2;
4550 }
4551 }
4552 }
4553
TestRegisterAlias()4554 void TransliteratorTest::TestRegisterAlias() {
4555 UnicodeString longID("Lower;[aeiou]Upper");
4556 UnicodeString shortID("Any-CapVowels");
4557 UnicodeString reallyShortID("CapVowels");
4558
4559 Transliterator::registerAlias(shortID, longID);
4560
4561 UErrorCode err = U_ZERO_ERROR;
4562 Transliterator* t1 = Transliterator::createInstance(longID, UTRANS_FORWARD, err);
4563 if (U_FAILURE(err)) {
4564 errln("Failed to instantiate transliterator with long ID");
4565 Transliterator::unregister(shortID);
4566 return;
4567 }
4568 Transliterator* t2 = Transliterator::createInstance(reallyShortID, UTRANS_FORWARD, err);
4569 if (U_FAILURE(err)) {
4570 errln("Failed to instantiate transliterator with short ID");
4571 delete t1;
4572 Transliterator::unregister(shortID);
4573 return;
4574 }
4575
4576 if (t1->getID() != longID)
4577 errln("Transliterator instantiated with long ID doesn't have long ID");
4578 if (t2->getID() != reallyShortID)
4579 errln("Transliterator instantiated with short ID doesn't have short ID");
4580
4581 UnicodeString rules1;
4582 UnicodeString rules2;
4583
4584 t1->toRules(rules1, true);
4585 t2->toRules(rules2, true);
4586 if (rules1 != rules2)
4587 errln("Alias transliterators aren't the same");
4588
4589 delete t1;
4590 delete t2;
4591 Transliterator::unregister(shortID);
4592
4593 t1 = Transliterator::createInstance(shortID, UTRANS_FORWARD, err);
4594 if (U_SUCCESS(err)) {
4595 errln("Instantiation with short ID succeeded after short ID was unregistered");
4596 delete t1;
4597 }
4598
4599 // try the same thing again, but this time with something other than
4600 // an instance of CompoundTransliterator
4601 UnicodeString realID("Latin-Greek");
4602 UnicodeString fakeID("Latin-dlgkjdflkjdl");
4603 Transliterator::registerAlias(fakeID, realID);
4604
4605 err = U_ZERO_ERROR;
4606 t1 = Transliterator::createInstance(realID, UTRANS_FORWARD, err);
4607 if (U_FAILURE(err)) {
4608 dataerrln("Failed to instantiate transliterator with real ID - %s", u_errorName(err));
4609 Transliterator::unregister(realID);
4610 return;
4611 }
4612 t2 = Transliterator::createInstance(fakeID, UTRANS_FORWARD, err);
4613 if (U_FAILURE(err)) {
4614 errln("Failed to instantiate transliterator with fake ID");
4615 delete t1;
4616 Transliterator::unregister(realID);
4617 return;
4618 }
4619
4620 t1->toRules(rules1, true);
4621 t2->toRules(rules2, true);
4622 if (rules1 != rules2)
4623 errln("Alias transliterators aren't the same");
4624
4625 delete t1;
4626 delete t2;
4627 Transliterator::unregister(fakeID);
4628 }
4629
TestRuleStripping()4630 void TransliteratorTest::TestRuleStripping() {
4631 /*
4632 #
4633 \uE001>\u0C01; # SIGN
4634 */
4635 static const char16_t rule[] = {
4636 0x0023,0x0020,0x000D,0x000A,
4637 0xE001,0x003E,0x0C01,0x003B,0x0020,0x0023,0x0020,0x0053,0x0049,0x0047,0x004E,0
4638 };
4639 static const char16_t expectedRule[] = {
4640 0xE001,0x003E,0x0C01,0x003B,0
4641 };
4642 char16_t result[UPRV_LENGTHOF(rule)];
4643 UErrorCode status = U_ZERO_ERROR;
4644 int32_t len = utrans_stripRules(rule, UPRV_LENGTHOF(rule), result, &status);
4645 if (len != u_strlen(expectedRule)) {
4646 errln("utrans_stripRules return len = %d", len);
4647 }
4648 if (u_strncmp(expectedRule, result, len) != 0) {
4649 errln("utrans_stripRules did not return expected string");
4650 }
4651 }
4652
4653 /**
4654 * Test the Halfwidth-Fullwidth transliterator (ticket 6281).
4655 */
TestHalfwidthFullwidth()4656 void TransliteratorTest::TestHalfwidthFullwidth() {
4657 UParseError parseError;
4658 UErrorCode status = U_ZERO_ERROR;
4659 Transliterator* hf = Transliterator::createInstance("Halfwidth-Fullwidth", UTRANS_FORWARD, parseError, status);
4660 Transliterator* fh = Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, parseError, status);
4661 if (hf == 0 || fh == 0) {
4662 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4663 delete hf;
4664 delete fh;
4665 return;
4666 }
4667
4668 // Array of 2n items
4669 // Each item is
4670 // "hf"|"fh"|"both",
4671 // <Halfwidth>,
4672 // <Fullwidth>
4673 const char* DATA[] = {
4674 "both",
4675 "\\uFFE9\\uFFEA\\uFFEB\\uFFEC\\u0061\\uFF71\\u00AF\\u0020",
4676 "\\u2190\\u2191\\u2192\\u2193\\uFF41\\u30A2\\uFFE3\\u3000",
4677 };
4678 int32_t DATA_length = UPRV_LENGTHOF(DATA);
4679
4680 for (int32_t i=0; i<DATA_length; i+=3) {
4681 UnicodeString h = CharsToUnicodeString(DATA[i+1]);
4682 UnicodeString f = CharsToUnicodeString(DATA[i+2]);
4683 switch (*DATA[i]) {
4684 case 0x68: //'h': // Halfwidth-Fullwidth only
4685 expect(*hf, h, f);
4686 break;
4687 case 0x66: //'f': // Fullwidth-Halfwidth only
4688 expect(*fh, f, h);
4689 break;
4690 case 0x62: //'b': // both directions
4691 expect(*hf, h, f);
4692 expect(*fh, f, h);
4693 break;
4694 }
4695 }
4696 delete hf;
4697 delete fh;
4698 }
4699
4700
4701 /**
4702 * Test Thai. The text is the first paragraph of "What is Unicode" from the Unicode.org web site.
4703 * TODO: confirm that the expected results are correct.
4704 * For now, test just confirms that C++ and Java give identical results.
4705 */
TestThai()4706 void TransliteratorTest::TestThai() {
4707 #if !UCONFIG_NO_BREAK_ITERATION
4708 // The expectations in this test heavily depends on the Thai dictionary.
4709 // Therefore, we skip this test under the LSTM configuration.
4710 if (skipDictionaryTest()) {
4711 return;
4712 }
4713 UParseError parseError;
4714 UErrorCode status = U_ZERO_ERROR;
4715 Transliterator* tr = Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4716 if (tr == 0) {
4717 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4718 return;
4719 }
4720 if (U_FAILURE(status)) {
4721 errln("FAIL: createInstance failed with %s", u_errorName(status));
4722 return;
4723 }
4724 const char *thaiText =
4725 "\\u0e42\\u0e14\\u0e22\\u0e1e\\u0e37\\u0e49\\u0e19\\u0e10\\u0e32\\u0e19\\u0e41\\u0e25\\u0e49\\u0e27, \\u0e04\\u0e2d"
4726 "\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d\\u0e23\\u0e4c\\u0e08\\u0e30\\u0e40\\u0e01\\u0e35\\u0e48\\u0e22"
4727 "\\u0e27\\u0e02\\u0e49\\u0e2d\\u0e07\\u0e01\\u0e31\\u0e1a\\u0e40\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e02\\u0e2d"
4728 "\\u0e07\\u0e15\\u0e31\\u0e27\\u0e40\\u0e25\\u0e02. \\u0e04\\u0e2d\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d"
4729 "\\u0e23\\u0e4c\\u0e08\\u0e31\\u0e14\\u0e40\\u0e01\\u0e47\\u0e1a\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29"
4730 "\\u0e23\\u0e41\\u0e25\\u0e30\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30\\u0e2d\\u0e37\\u0e48\\u0e19\\u0e46 \\u0e42"
4731 "\\u0e14\\u0e22\\u0e01\\u0e32\\u0e23\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25"
4732 "\\u0e02\\u0e43\\u0e2b\\u0e49\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e41\\u0e15\\u0e48\\u0e25\\u0e30\\u0e15"
4733 "\\u0e31\\u0e27. \\u0e01\\u0e48\\u0e2d\\u0e19\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48\\u0e4a Unicode \\u0e08"
4734 "\\u0e30\\u0e16\\u0e39\\u0e01\\u0e2a\\u0e23\\u0e49\\u0e32\\u0e07\\u0e02\\u0e36\\u0e49\\u0e19, \\u0e44\\u0e14\\u0e49"
4735 "\\u0e21\\u0e35\\u0e23\\u0e30\\u0e1a\\u0e1a encoding \\u0e2d\\u0e22\\u0e39\\u0e48\\u0e2b\\u0e25\\u0e32\\u0e22\\u0e23"
4736 "\\u0e49\\u0e2d\\u0e22\\u0e23\\u0e30\\u0e1a\\u0e1a\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e01\\u0e32\\u0e23"
4737 "\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25\\u0e02\\u0e40\\u0e2b\\u0e25\\u0e48"
4738 "\\u0e32\\u0e19\\u0e35\\u0e49. \\u0e44\\u0e21\\u0e48\\u0e21\\u0e35 encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48"
4739 "\\u0e21\\u0e35\\u0e08\\u0e33\\u0e19\\u0e27\\u0e19\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30"
4740 "\\u0e21\\u0e32\\u0e01\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d: \\u0e22\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d"
4741 "\\u0e22\\u0e48\\u0e32\\u0e07\\u0e40\\u0e0a\\u0e48\\u0e19, \\u0e40\\u0e09\\u0e1e\\u0e32\\u0e30\\u0e43\\u0e19\\u0e01"
4742 "\\u0e25\\u0e38\\u0e48\\u0e21\\u0e2a\\u0e2b\\u0e20\\u0e32\\u0e1e\\u0e22\\u0e38\\u0e42\\u0e23\\u0e1b\\u0e40\\u0e1e"
4743 "\\u0e35\\u0e22\\u0e07\\u0e41\\u0e2b\\u0e48\\u0e07\\u0e40\\u0e14\\u0e35\\u0e22\\u0e27 \\u0e01\\u0e47\\u0e15\\u0e49"
4744 "\\u0e2d\\u0e07\\u0e01\\u0e32\\u0e23\\u0e2b\\u0e25\\u0e32\\u0e22 encoding \\u0e43\\u0e19\\u0e01\\u0e32\\u0e23\\u0e04"
4745 "\\u0e23\\u0e2d\\u0e1a\\u0e04\\u0e25\\u0e38\\u0e21\\u0e17\\u0e38\\u0e01\\u0e20\\u0e32\\u0e29\\u0e32\\u0e43\\u0e19"
4746 "\\u0e01\\u0e25\\u0e38\\u0e48\\u0e21. \\u0e2b\\u0e23\\u0e37\\u0e2d\\u0e41\\u0e21\\u0e49\\u0e41\\u0e15\\u0e48\\u0e43"
4747 "\\u0e19\\u0e20\\u0e32\\u0e29\\u0e32\\u0e40\\u0e14\\u0e35\\u0e48\\u0e22\\u0e27 \\u0e40\\u0e0a\\u0e48\\u0e19 \\u0e20"
4748 "\\u0e32\\u0e29\\u0e32\\u0e2d\\u0e31\\u0e07\\u0e01\\u0e24\\u0e29 \\u0e01\\u0e47\\u0e44\\u0e21\\u0e48\\u0e21\\u0e35"
4749 " encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d\\u0e2a\\u0e33\\u0e2b"
4750 "\\u0e23\\u0e31\\u0e1a\\u0e17\\u0e38\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29\\u0e23, \\u0e40\\u0e04"
4751 "\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e27\\u0e23\\u0e23\\u0e04\\u0e15\\u0e2d\\u0e19"
4752 " \\u0e41\\u0e25\\u0e30\\u0e2a\\u0e31\\u0e0d\\u0e25\\u0e31\\u0e01\\u0e29\\u0e13\\u0e4c\\u0e17\\u0e32\\u0e07\\u0e40"
4753 "\\u0e17\\u0e04\\u0e19\\u0e34\\u0e04\\u0e17\\u0e35\\u0e48\\u0e43\\u0e0a\\u0e49\\u0e01\\u0e31\\u0e19\\u0e2d\\u0e22"
4754 "\\u0e39\\u0e48\\u0e17\\u0e31\\u0e48\\u0e27\\u0e44\\u0e1b.";
4755
4756 const char *latinText =
4757 "doy ph\\u1ee5\\u0304\\u0302n \\u1e6d\\u0304h\\u0101n l\\u00e6\\u0302w, khxmphiwtexr\\u0312 ca ke\\u012b\\u0300"
4758 "ywk\\u0304\\u0125xng k\\u1ea1b re\\u1ee5\\u0304\\u0300xng k\\u0304hxng t\\u1ea1wlek\\u0304h. khxmphiwtexr"
4759 "\\u0312 c\\u1ea1d k\\u0115b t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r l\\u00e6a x\\u1ea1kk\\u0304h ra x\\u1ee5\\u0304"
4760 "\\u0300n\\u00ab doy k\\u0101r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304\\u0131\\u0302 s\\u0304"
4761 "\\u1ea3h\\u0304r\\u1ea1b t\\u00e6\\u0300la t\\u1ea1w. k\\u0300xn h\\u0304n\\u0302\\u0101 th\\u012b\\u0300\\u0301"
4762 " Unicode ca t\\u0304h\\u016bk s\\u0304r\\u0302\\u0101ng k\\u0304h\\u1ee5\\u0302n, d\\u1ecb\\u0302 m\\u012b "
4763 "rabb encoding xy\\u016b\\u0300 h\\u0304l\\u0101y r\\u0302xy rabb s\\u0304\\u1ea3h\\u0304r\\u1ea1b k\\u0101"
4764 "r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304el\\u0300\\u0101 n\\u012b\\u0302. m\\u1ecb\\u0300m"
4765 "\\u012b encoding d\\u0131 th\\u012b\\u0300 m\\u012b c\\u1ea3nwn t\\u1ea1w x\\u1ea1kk\\u0304hra m\\u0101k p"
4766 "he\\u012byng phx: yk t\\u1ea1wx\\u1ef3\\u0101ng ch\\u00e8n, c\\u0304heph\\u0101a n\\u0131 kl\\u00f9m s\\u0304"
4767 "h\\u0304p\\u0323h\\u0101ph yurop phe\\u012byng h\\u0304\\u00e6\\u0300ng de\\u012byw k\\u0306 t\\u0302xngk\\u0101"
4768 "r h\\u0304l\\u0101y encoding n\\u0131 k\\u0101r khrxbkhlum thuk p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 n\\u0131"
4769 " kl\\u00f9m. h\\u0304r\\u1ee5\\u0304x m\\u00e6\\u0302t\\u00e6\\u0300 n\\u0131 p\\u0323h\\u0101s\\u0304\\u02b9"
4770 "\\u0101 de\\u012b\\u0300yw ch\\u00e8n p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 x\\u1ea1ngkvs\\u0304\\u02b9 k\\u0306"
4771 " m\\u1ecb\\u0300m\\u012b encoding d\\u0131 th\\u012b\\u0300 phe\\u012byng phx s\\u0304\\u1ea3h\\u0304r\\u1ea1"
4772 "b thuk t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r, kher\\u1ee5\\u0304\\u0300xngh\\u0304m\\u0101y wrrkh txn l\\u00e6"
4773 "a s\\u0304\\u1ea1\\u1ef5l\\u1ea1ks\\u0304\\u02b9\\u1e47\\u0312 th\\u0101ng thekhnikh th\\u012b\\u0300 ch\\u0131"
4774 "\\u0302 k\\u1ea1n xy\\u016b\\u0300 th\\u1ea1\\u0300wp\\u1ecb.";
4775
4776
4777 UnicodeString xlitText(thaiText);
4778 xlitText = xlitText.unescape();
4779 tr->transliterate(xlitText);
4780
4781 UnicodeString expectedText(latinText);
4782 expectedText = expectedText.unescape();
4783 expect(*tr, xlitText, expectedText);
4784
4785 delete tr;
4786 #endif
4787 }
4788
4789
4790 //======================================================================
4791 // Support methods
4792 //======================================================================
expectT(const UnicodeString & id,const UnicodeString & source,const UnicodeString & expectedResult)4793 void TransliteratorTest::expectT(const UnicodeString& id,
4794 const UnicodeString& source,
4795 const UnicodeString& expectedResult) {
4796 UErrorCode ec = U_ZERO_ERROR;
4797 UParseError pe;
4798 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
4799 if (U_FAILURE(ec)) {
4800 errln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(ec));
4801 delete t;
4802 return;
4803 }
4804 expect(*t, source, expectedResult);
4805 delete t;
4806 }
4807
reportParseError(const UnicodeString & message,const UParseError & parseError,const UErrorCode & status)4808 void TransliteratorTest::reportParseError(const UnicodeString& message,
4809 const UParseError& parseError,
4810 const UErrorCode& status) {
4811 dataerrln(message +
4812 /*", parse error " + parseError.code +*/
4813 ", line " + parseError.line +
4814 ", offset " + parseError.offset +
4815 ", pre-context " + prettify(parseError.preContext, true) +
4816 ", post-context " + prettify(parseError.postContext,true) +
4817 ", Error: " + u_errorName(status));
4818 }
4819
expect(const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4820 void TransliteratorTest::expect(const UnicodeString& rules,
4821 const UnicodeString& source,
4822 const UnicodeString& expectedResult,
4823 UTransPosition *pos) {
4824 expect("<ID>", rules, source, expectedResult, pos);
4825 }
4826
expect(const UnicodeString & id,const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4827 void TransliteratorTest::expect(const UnicodeString& id,
4828 const UnicodeString& rules,
4829 const UnicodeString& source,
4830 const UnicodeString& expectedResult,
4831 UTransPosition *pos) {
4832 UErrorCode status = U_ZERO_ERROR;
4833 UParseError parseError;
4834 Transliterator* t = Transliterator::createFromRules(id, rules, UTRANS_FORWARD, parseError, status);
4835 if (U_FAILURE(status)) {
4836 reportParseError(UnicodeString("Couldn't create transliterator from ") + rules, parseError, status);
4837 } else {
4838 expect(*t, source, expectedResult, pos);
4839 }
4840 delete t;
4841 }
4842
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,const Transliterator & reverseTransliterator)4843 void TransliteratorTest::expect(const Transliterator& t,
4844 const UnicodeString& source,
4845 const UnicodeString& expectedResult,
4846 const Transliterator& reverseTransliterator) {
4847 expect(t, source, expectedResult);
4848 expect(reverseTransliterator, expectedResult, source);
4849 }
4850
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4851 void TransliteratorTest::expect(const Transliterator& t,
4852 const UnicodeString& source,
4853 const UnicodeString& expectedResult,
4854 UTransPosition *pos) {
4855 if (pos == 0) {
4856 UnicodeString result(source);
4857 t.transliterate(result);
4858 expectAux(t.getID() + ":String", source, result, expectedResult);
4859 }
4860 UTransPosition index={0, 0, 0, 0};
4861 if (pos != 0) {
4862 index = *pos;
4863 }
4864
4865 UnicodeString rsource(source);
4866 if (pos == 0) {
4867 t.transliterate(rsource);
4868 } else {
4869 // Do it all at once -- below we do it incrementally
4870 t.finishTransliteration(rsource, *pos);
4871 }
4872 expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
4873
4874 // Test keyboard (incremental) transliteration -- this result
4875 // must be the same after we finalize (see below).
4876 UnicodeString log;
4877 rsource.remove();
4878 if (pos != 0) {
4879 rsource = source;
4880 formatInput(log, rsource, index);
4881 log.append(" -> ");
4882 UErrorCode status = U_ZERO_ERROR;
4883 t.transliterate(rsource, index, status);
4884 formatInput(log, rsource, index);
4885 } else {
4886 for (int32_t i=0; i<source.length(); ++i) {
4887 if (i != 0) {
4888 log.append(" + ");
4889 }
4890 log.append(source.charAt(i)).append(" -> ");
4891 UErrorCode status = U_ZERO_ERROR;
4892 t.transliterate(rsource, index, source.charAt(i), status);
4893 formatInput(log, rsource, index);
4894 }
4895 }
4896
4897 // As a final step in keyboard transliteration, we must call
4898 // transliterate to finish off any pending partial matches that
4899 // were waiting for more input.
4900 t.finishTransliteration(rsource, index);
4901 log.append(" => ").append(rsource);
4902
4903 expectAux(t.getID() + ":Keyboard", log,
4904 rsource == expectedResult,
4905 expectedResult);
4906 }
4907
4908
4909 /**
4910 * @param appendTo result is appended to this param.
4911 * @param input the string being transliterated
4912 * @param pos the index struct
4913 */
formatInput(UnicodeString & appendTo,const UnicodeString & input,const UTransPosition & pos)4914 UnicodeString& TransliteratorTest::formatInput(UnicodeString &appendTo,
4915 const UnicodeString& input,
4916 const UTransPosition& pos) {
4917 // Output a string of the form aaa{bbb|ccc|ddd}eee, where
4918 // the {} indicate the context start and limit, and the ||
4919 // indicate the start and limit.
4920 if (0 <= pos.contextStart &&
4921 pos.contextStart <= pos.start &&
4922 pos.start <= pos.limit &&
4923 pos.limit <= pos.contextLimit &&
4924 pos.contextLimit <= input.length()) {
4925
4926 UnicodeString a, b, c, d, e;
4927 input.extractBetween(0, pos.contextStart, a);
4928 input.extractBetween(pos.contextStart, pos.start, b);
4929 input.extractBetween(pos.start, pos.limit, c);
4930 input.extractBetween(pos.limit, pos.contextLimit, d);
4931 input.extractBetween(pos.contextLimit, input.length(), e);
4932 appendTo.append(a).append((char16_t)123/*{*/).append(b).
4933 append((char16_t)PIPE).append(c).append((char16_t)PIPE).append(d).
4934 append((char16_t)125/*}*/).append(e);
4935 } else {
4936 appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
4937 pos.contextStart + ", s=" + pos.start + ", l=" +
4938 pos.limit + ", cl=" + pos.contextLimit + "} on " +
4939 input);
4940 }
4941 return appendTo;
4942 }
4943
expectAux(const UnicodeString & tag,const UnicodeString & source,const UnicodeString & result,const UnicodeString & expectedResult)4944 void TransliteratorTest::expectAux(const UnicodeString& tag,
4945 const UnicodeString& source,
4946 const UnicodeString& result,
4947 const UnicodeString& expectedResult) {
4948 expectAux(tag, source + " -> " + result,
4949 result == expectedResult,
4950 expectedResult);
4951 }
4952
expectAux(const UnicodeString & tag,const UnicodeString & summary,UBool pass,const UnicodeString & expectedResult)4953 void TransliteratorTest::expectAux(const UnicodeString& tag,
4954 const UnicodeString& summary, UBool pass,
4955 const UnicodeString& expectedResult) {
4956 if (pass) {
4957 logln(UnicodeString("(")+tag+") " + prettify(summary));
4958 } else {
4959 dataerrln(UnicodeString("FAIL: (")+tag+") "
4960 + prettify(summary)
4961 + ", expected " + prettify(expectedResult));
4962 }
4963 }
4964
4965 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
4966