1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1999-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 11/10/99 aliu Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "transtst.h"
18 #include "unicode/locid.h"
19 #include "unicode/dtfmtsym.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/translit.h"
22 #include "unicode/uchar.h"
23 #include "unicode/unifilt.h"
24 #include "unicode/uniset.h"
25 #include "unicode/ustring.h"
26 #include "unicode/usetiter.h"
27 #include "unicode/uscript.h"
28 #include "unicode/utf16.h"
29 #include "cpdtrans.h"
30 #include "nultrans.h"
31 #include "rbt.h"
32 #include "rbt_pars.h"
33 #include "anytrans.h"
34 #include "esctrn.h"
35 #include "name2uni.h"
36 #include "nortrans.h"
37 #include "remtrans.h"
38 #include "titletrn.h"
39 #include "tolowtrn.h"
40 #include "toupptrn.h"
41 #include "unesctrn.h"
42 #include "uni2name.h"
43 #include "cstring.h"
44 #include "cmemory.h"
45 #include <stdio.h>
46
47 /***********************************************************************
48
49 HOW TO USE THIS TEST FILE
50 -or-
51 How I developed on two platforms
52 without losing (too much of) my mind
53
54
55 1. Add new tests by copying/pasting/changing existing tests. On Java,
56 any public void method named Test...() taking no parameters becomes
57 a test. On C++, you need to modify the header and add a line to
58 the runIndexedTest() dispatch method.
59
60 2. Make liberal use of the expect() method; it is your friend.
61
62 3. The tests in this file exactly match those in a sister file on the
63 other side. The two files are:
64
65 icu4j: src/com/ibm/test/translit/TransliteratorTest.java
66 icu4c: source/test/intltest/transtst.cpp
67
68 ==> THIS IS THE IMPORTANT PART <==
69
70 When you add a test in this file, add it in TransliteratorTest.java
71 too. Give it the same name and put it in the same relative place.
72 This makes maintenance a lot simpler for any poor soul who ends up
73 trying to synchronize the tests between icu4j and icu4c.
74
75 4. If you MUST enter a test that is NOT paralleled in the sister file,
76 then add it in the special non-mirrored section. These are
77 labeled
78
79 "icu4j ONLY"
80
81 or
82
83 "icu4c ONLY"
84
85 Make sure you document the reason the test is here and not there.
86
87
88 Thank you.
89 The Management
90 ***********************************************************************/
91
92 // Define character constants thusly to be EBCDIC-friendly
93 enum {
94 LEFT_BRACE=((UChar)0x007B), /*{*/
95 PIPE =((UChar)0x007C), /*|*/
96 ZERO =((UChar)0x0030), /*0*/
97 UPPER_A =((UChar)0x0041) /*A*/
98 };
99
TransliteratorTest()100 TransliteratorTest::TransliteratorTest()
101 : DESERET_DEE((UChar32)0x10414),
102 DESERET_dee((UChar32)0x1043C)
103 {
104 }
105
~TransliteratorTest()106 TransliteratorTest::~TransliteratorTest() {}
107
108 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)109 TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
110 const char* &name, char* /*par*/) {
111 switch (index) {
112 TESTCASE(0,TestInstantiation);
113 TESTCASE(1,TestSimpleRules);
114 TESTCASE(2,TestRuleBasedInverse);
115 TESTCASE(3,TestKeyboard);
116 TESTCASE(4,TestKeyboard2);
117 TESTCASE(5,TestKeyboard3);
118 TESTCASE(6,TestArabic);
119 TESTCASE(7,TestCompoundKana);
120 TESTCASE(8,TestCompoundHex);
121 TESTCASE(9,TestFiltering);
122 TESTCASE(10,TestInlineSet);
123 TESTCASE(11,TestPatternQuoting);
124 TESTCASE(12,TestJ277);
125 TESTCASE(13,TestJ243);
126 TESTCASE(14,TestJ329);
127 TESTCASE(15,TestSegments);
128 TESTCASE(16,TestCursorOffset);
129 TESTCASE(17,TestArbitraryVariableValues);
130 TESTCASE(18,TestPositionHandling);
131 TESTCASE(19,TestHiraganaKatakana);
132 TESTCASE(20,TestCopyJ476);
133 TESTCASE(21,TestAnchors);
134 TESTCASE(22,TestInterIndic);
135 TESTCASE(23,TestFilterIDs);
136 TESTCASE(24,TestCaseMap);
137 TESTCASE(25,TestNameMap);
138 TESTCASE(26,TestLiberalizedID);
139 TESTCASE(27,TestCreateInstance);
140 TESTCASE(28,TestNormalizationTransliterator);
141 TESTCASE(29,TestCompoundRBT);
142 TESTCASE(30,TestCompoundFilter);
143 TESTCASE(31,TestRemove);
144 TESTCASE(32,TestToRules);
145 TESTCASE(33,TestContext);
146 TESTCASE(34,TestSupplemental);
147 TESTCASE(35,TestQuantifier);
148 TESTCASE(36,TestSTV);
149 TESTCASE(37,TestCompoundInverse);
150 TESTCASE(38,TestNFDChainRBT);
151 TESTCASE(39,TestNullInverse);
152 TESTCASE(40,TestAliasInverseID);
153 TESTCASE(41,TestCompoundInverseID);
154 TESTCASE(42,TestUndefinedVariable);
155 TESTCASE(43,TestEmptyContext);
156 TESTCASE(44,TestCompoundFilterID);
157 TESTCASE(45,TestPropertySet);
158 TESTCASE(46,TestNewEngine);
159 TESTCASE(47,TestQuantifiedSegment);
160 TESTCASE(48,TestDevanagariLatinRT);
161 TESTCASE(49,TestTeluguLatinRT);
162 TESTCASE(50,TestCompoundLatinRT);
163 TESTCASE(51,TestSanskritLatinRT);
164 TESTCASE(52,TestLocaleInstantiation);
165 TESTCASE(53,TestTitleAccents);
166 TESTCASE(54,TestLocaleResource);
167 TESTCASE(55,TestParseError);
168 TESTCASE(56,TestOutputSet);
169 TESTCASE(57,TestVariableRange);
170 TESTCASE(58,TestInvalidPostContext);
171 TESTCASE(59,TestIDForms);
172 TESTCASE(60,TestToRulesMark);
173 TESTCASE(61,TestEscape);
174 TESTCASE(62,TestAnchorMasking);
175 TESTCASE(63,TestDisplayName);
176 TESTCASE(64,TestSpecialCases);
177 #if !UCONFIG_NO_FILE_IO
178 TESTCASE(65,TestIncrementalProgress);
179 #endif
180 TESTCASE(66,TestSurrogateCasing);
181 TESTCASE(67,TestFunction);
182 TESTCASE(68,TestInvalidBackRef);
183 TESTCASE(69,TestMulticharStringSet);
184 TESTCASE(70,TestUserFunction);
185 TESTCASE(71,TestAnyX);
186 TESTCASE(72,TestSourceTargetSet);
187 TESTCASE(73,TestGurmukhiDevanagari);
188 TESTCASE(74,TestPatternWhiteSpace);
189 TESTCASE(75,TestAllCodepoints);
190 TESTCASE(76,TestBoilerplate);
191 TESTCASE(77,TestAlternateSyntax);
192 TESTCASE(78,TestBeginEnd);
193 TESTCASE(79,TestBeginEndToRules);
194 TESTCASE(80,TestRegisterAlias);
195 TESTCASE(81,TestRuleStripping);
196 TESTCASE(82,TestHalfwidthFullwidth);
197 TESTCASE(83,TestThai);
198 TESTCASE(84,TestAny);
199 TESTCASE(85,TestBasicTransliteratorEvenWithoutData);
200 default: name = ""; break;
201 }
202 }
203
204 /**
205 * Make sure every system transliterator can be instantiated.
206 *
207 * ALSO test that the result of toRules() for each rule is a valid
208 * rule. Do this here so we don't have to have another test that
209 * instantiates everything as well.
210 */
TestInstantiation()211 void TransliteratorTest::TestInstantiation() {
212 UErrorCode ec = U_ZERO_ERROR;
213 StringEnumeration* avail = Transliterator::getAvailableIDs(ec);
214 assertSuccess("getAvailableIDs()", ec);
215 assertTrue("getAvailableIDs()!=NULL", avail!=NULL);
216 int32_t n = Transliterator::countAvailableIDs();
217 assertTrue("getAvailableIDs().count()==countAvailableIDs()",
218 avail->count(ec) == n);
219 assertSuccess("count()", ec);
220 UnicodeString name;
221 for (int32_t i=0; i<n; ++i) {
222 const UnicodeString& id = *avail->snext(ec);
223 if (!assertSuccess("snext()", ec) ||
224 !assertTrue("snext()!=NULL", (&id)!=NULL, true)) {
225 break;
226 }
227 UnicodeString id2 = Transliterator::getAvailableID(i);
228 if (id.length() < 1) {
229 errln(UnicodeString("FAIL: getAvailableID(") +
230 i + ") returned empty string");
231 continue;
232 }
233 if (id != id2) {
234 errln(UnicodeString("FAIL: getAvailableID(") +
235 i + ") != getAvailableIDs().snext()");
236 continue;
237 }
238 UParseError parseError;
239 UErrorCode status = U_ZERO_ERROR;
240 Transliterator* t = Transliterator::createInstance(id,
241 UTRANS_FORWARD, parseError,status);
242 name.truncate(0);
243 Transliterator::getDisplayName(id, name);
244 if (t == 0) {
245 #if UCONFIG_NO_BREAK_ITERATION
246 // If UCONFIG_NO_BREAK_ITERATION is on, then only Thai should fail.
247 if (id.compare((UnicodeString)"Thai-Latn") != 0 &&
248 id.compare((UnicodeString)"Thai-Latin") != 0)
249 #endif
250 dataerrln(UnicodeString("FAIL: Couldn't create ") + id +
251 /*", parse error " + parseError.code +*/
252 ", line " + parseError.line +
253 ", offset " + parseError.offset +
254 ", pre-context " + prettify(parseError.preContext, true) +
255 ", post-context " +prettify(parseError.postContext,true) +
256 ", Error: " + u_errorName(status));
257 // When createInstance fails, it deletes the failing
258 // entry from the available ID list. We detect this
259 // here by looking for a change in countAvailableIDs.
260 int32_t nn = Transliterator::countAvailableIDs();
261 if (nn == (n - 1)) {
262 n = nn;
263 --i; // Compensate for deleted entry
264 }
265 } else {
266 logln(UnicodeString("OK: ") + name + " (" + id + ")");
267
268 // Now test toRules
269 UnicodeString rules;
270 t->toRules(rules, true);
271 Transliterator *u = Transliterator::createFromRules("x",
272 rules, UTRANS_FORWARD, parseError,status);
273 if (u == 0) {
274 errln(UnicodeString("FAIL: ") + id +
275 ".createFromRules() => bad rules" +
276 /*", parse error " + parseError.code +*/
277 ", line " + parseError.line +
278 ", offset " + parseError.offset +
279 ", context " + prettify(parseError.preContext, true) +
280 ", rules: " + prettify(rules, true));
281 } else {
282 delete u;
283 }
284 delete t;
285 }
286 }
287 assertTrue("snext()==NULL", avail->snext(ec)==NULL);
288 assertSuccess("snext()", ec);
289 delete avail;
290
291 // Now test the failure path
292 UParseError parseError;
293 UErrorCode status = U_ZERO_ERROR;
294 UnicodeString id("<Not a valid Transliterator ID>");
295 Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
296 if (t != 0) {
297 errln("FAIL: " + id + " returned a transliterator");
298 delete t;
299 } else {
300 logln("OK: Bogus ID handled properly");
301 }
302 }
303
TestSimpleRules(void)304 void TransliteratorTest::TestSimpleRules(void) {
305 /* Example: rules 1. ab>x|y
306 * 2. yc>z
307 *
308 * []|eabcd start - no match, copy e to translated buffer
309 * [e]|abcd match rule 1 - copy output & adjust cursor
310 * [ex|y]cd match rule 2 - copy output & adjust cursor
311 * [exz]|d no match, copy d to transliterated buffer
312 * [exzd]| done
313 */
314 expect(UnicodeString("ab>x|y;", "") +
315 "yc>z",
316 "eabcd", "exzd");
317
318 /* Another set of rules:
319 * 1. ab>x|yzacw
320 * 2. za>q
321 * 3. qc>r
322 * 4. cw>n
323 *
324 * []|ab Rule 1
325 * [x|yzacw] No match
326 * [xy|zacw] Rule 2
327 * [xyq|cw] Rule 4
328 * [xyqn]| Done
329 */
330 expect(UnicodeString("ab>x|yzacw;") +
331 "za>q;" +
332 "qc>r;" +
333 "cw>n",
334 "ab", "xyqn");
335
336 /* Test categories
337 */
338 UErrorCode status = U_ZERO_ERROR;
339 UParseError parseError;
340 Transliterator *t = Transliterator::createFromRules(
341 "<ID>",
342 UnicodeString("$dummy=").append((UChar)0xE100) +
343 UnicodeString(";"
344 "$vowel=[aeiouAEIOU];"
345 "$lu=[:Lu:];"
346 "$vowel } $lu > '!';"
347 "$vowel > '&';"
348 "'!' { $lu > '^';"
349 "$lu > '*';"
350 "a > ERROR", ""),
351 UTRANS_FORWARD, parseError,
352 status);
353 if (U_FAILURE(status)) {
354 dataerrln("FAIL: RBT constructor failed - %s", u_errorName(status));
355 return;
356 }
357 expect(*t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
358 delete t;
359 }
360
361 /**
362 * Test inline set syntax and set variable syntax.
363 */
TestInlineSet(void)364 void TransliteratorTest::TestInlineSet(void) {
365 expect("{ [:Ll:] } x > y; [:Ll:] > z;", "aAbxq", "zAyzz");
366 expect("a[0-9]b > qrs", "1a7b9", "1qrs9");
367
368 expect(UnicodeString(
369 "$digit = [0-9];"
370 "$alpha = [a-zA-Z];"
371 "$alphanumeric = [$digit $alpha];" // ***
372 "$special = [^$alphanumeric];" // ***
373 "$alphanumeric > '-';"
374 "$special > '*';", ""),
375
376 "thx-1138", "---*----");
377 }
378
379 /**
380 * Create some inverses and confirm that they work. We have to be
381 * careful how we do this, since the inverses will not be true
382 * inverses -- we can't throw any random string at the composition
383 * of the transliterators and expect the identity function. F x
384 * F' != I. However, if we are careful about the input, we will
385 * get the expected results.
386 */
TestRuleBasedInverse(void)387 void TransliteratorTest::TestRuleBasedInverse(void) {
388 UnicodeString RULES =
389 UnicodeString("abc>zyx;") +
390 "ab>yz;" +
391 "bc>zx;" +
392 "ca>xy;" +
393 "a>x;" +
394 "b>y;" +
395 "c>z;" +
396
397 "abc<zyx;" +
398 "ab<yz;" +
399 "bc<zx;" +
400 "ca<xy;" +
401 "a<x;" +
402 "b<y;" +
403 "c<z;" +
404
405 "";
406
407 const char* DATA[] = {
408 // Careful here -- random strings will not work. If we keep
409 // the left side to the domain and the right side to the range
410 // we will be okay though (left, abc; right xyz).
411 "a", "x",
412 "abcacab", "zyxxxyy",
413 "caccb", "xyzzy",
414 };
415
416 int32_t DATA_length = UPRV_LENGTHOF(DATA);
417
418 UErrorCode status = U_ZERO_ERROR;
419 UParseError parseError;
420 Transliterator *fwd = Transliterator::createFromRules("<ID>", RULES,
421 UTRANS_FORWARD, parseError, status);
422 Transliterator *rev = Transliterator::createFromRules("<ID>", RULES,
423 UTRANS_REVERSE, parseError, status);
424 if (U_FAILURE(status)) {
425 errln("FAIL: RBT constructor failed");
426 return;
427 }
428 for (int32_t i=0; i<DATA_length; i+=2) {
429 expect(*fwd, DATA[i], DATA[i+1]);
430 expect(*rev, DATA[i+1], DATA[i]);
431 }
432 delete fwd;
433 delete rev;
434 }
435
436 /**
437 * Basic test of keyboard.
438 */
TestKeyboard(void)439 void TransliteratorTest::TestKeyboard(void) {
440 UParseError parseError;
441 UErrorCode status = U_ZERO_ERROR;
442 Transliterator *t = Transliterator::createFromRules("<ID>",
443 UnicodeString("psch>Y;")
444 +"ps>y;"
445 +"ch>x;"
446 +"a>A;",
447 UTRANS_FORWARD, parseError,
448 status);
449 if (U_FAILURE(status)) {
450 errln("FAIL: RBT constructor failed");
451 return;
452 }
453 const char* DATA[] = {
454 // insertion, buffer
455 "a", "A",
456 "p", "Ap",
457 "s", "Aps",
458 "c", "Apsc",
459 "a", "AycA",
460 "psch", "AycAY",
461 0, "AycAY", // null means finishKeyboardTransliteration
462 };
463
464 keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
465 delete t;
466 }
467
468 /**
469 * Basic test of keyboard with cursor.
470 */
TestKeyboard2(void)471 void TransliteratorTest::TestKeyboard2(void) {
472 UParseError parseError;
473 UErrorCode status = U_ZERO_ERROR;
474 Transliterator *t = Transliterator::createFromRules("<ID>",
475 UnicodeString("ych>Y;")
476 +"ps>|y;"
477 +"ch>x;"
478 +"a>A;",
479 UTRANS_FORWARD, parseError,
480 status);
481 if (U_FAILURE(status)) {
482 errln("FAIL: RBT constructor failed");
483 return;
484 }
485 const char* DATA[] = {
486 // insertion, buffer
487 "a", "A",
488 "p", "Ap",
489 "s", "Aps", // modified for rollback - "Ay",
490 "c", "Apsc", // modified for rollback - "Ayc",
491 "a", "AycA",
492 "p", "AycAp",
493 "s", "AycAps", // modified for rollback - "AycAy",
494 "c", "AycApsc", // modified for rollback - "AycAyc",
495 "h", "AycAY",
496 0, "AycAY", // null means finishKeyboardTransliteration
497 };
498
499 keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
500 delete t;
501 }
502
503 /**
504 * Test keyboard transliteration with back-replacement.
505 */
TestKeyboard3(void)506 void TransliteratorTest::TestKeyboard3(void) {
507 // We want th>z but t>y. Furthermore, during keyboard
508 // transliteration we want t>y then yh>z if t, then h are
509 // typed.
510 UnicodeString RULES("t>|y;"
511 "yh>z;");
512
513 const char* DATA[] = {
514 // Column 1: characters to add to buffer (as if typed)
515 // Column 2: expected appearance of buffer after
516 // keyboard xliteration.
517 "a", "a",
518 "b", "ab",
519 "t", "abt", // modified for rollback - "aby",
520 "c", "abyc",
521 "t", "abyct", // modified for rollback - "abycy",
522 "h", "abycz",
523 0, "abycz", // null means finishKeyboardTransliteration
524 };
525
526 UParseError parseError;
527 UErrorCode status = U_ZERO_ERROR;
528 Transliterator *t = Transliterator::createFromRules("<ID>", RULES, UTRANS_FORWARD, parseError, status);
529 if (U_FAILURE(status)) {
530 errln("FAIL: RBT constructor failed");
531 return;
532 }
533 keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
534 delete t;
535 }
536
keyboardAux(const Transliterator & t,const char * DATA[],int32_t DATA_length)537 void TransliteratorTest::keyboardAux(const Transliterator& t,
538 const char* DATA[], int32_t DATA_length) {
539 UErrorCode status = U_ZERO_ERROR;
540 UTransPosition index={0, 0, 0, 0};
541 UnicodeString s;
542 for (int32_t i=0; i<DATA_length; i+=2) {
543 UnicodeString log;
544 if (DATA[i] != 0) {
545 log = s + " + "
546 + DATA[i]
547 + " -> ";
548 t.transliterate(s, index, DATA[i], status);
549 } else {
550 log = s + " => ";
551 t.finishTransliteration(s, index);
552 }
553 // Show the start index '{' and the cursor '|'
554 UnicodeString a, b, c;
555 s.extractBetween(0, index.contextStart, a);
556 s.extractBetween(index.contextStart, index.start, b);
557 s.extractBetween(index.start, s.length(), c);
558 log.append(a).
559 append((UChar)LEFT_BRACE).
560 append(b).
561 append((UChar)PIPE).
562 append(c);
563 if (s == DATA[i+1] && U_SUCCESS(status)) {
564 logln(log);
565 } else {
566 errln(UnicodeString("FAIL: ") + log + ", expected " + DATA[i+1]);
567 }
568 }
569 }
570
TestArabic(void)571 void TransliteratorTest::TestArabic(void) {
572 // Test disabled for 2.0 until new Arabic transliterator can be written.
573 // /*
574 // const char* DATA[] = {
575 // "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
576 // "\u0627\u0644\u0644\u063a\u0629\u0020"+
577 // "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
578 // "\u0628\u0628\u0646\u0638\u0645\u0020"+
579 // "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
580 // "\u062c\u0645\u064a\u0644\u0629",
581 // };
582 // */
583 //
584 // UChar ar_raw[] = {
585 // 0x062a, 0x062a, 0x0645, 0x062a, 0x0639, 0x0020, 0x0627,
586 // 0x0644, 0x0644, 0x063a, 0x0629, 0x0020, 0x0627, 0x0644,
587 // 0x0639, 0x0631, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
588 // 0x0628, 0x0628, 0x0646, 0x0638, 0x0645, 0x0020, 0x0643,
589 // 0x062a, 0x0627, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
590 // 0x062c, 0x0645, 0x064a, 0x0644, 0x0629, 0
591 // };
592 // UnicodeString ar(ar_raw);
593 // UErrorCode status=U_ZERO_ERROR;
594 // UParseError parseError;
595 // Transliterator *t = Transliterator::createInstance("Latin-Arabic", UTRANS_FORWARD, parseError, status);
596 // if (t == 0) {
597 // errln("FAIL: createInstance failed");
598 // return;
599 // }
600 // expect(*t, "Arabic", ar);
601 // delete t;
602 }
603
604 /**
605 * Compose the Kana transliterator forward and reverse and try
606 * some strings that should come out unchanged.
607 */
TestCompoundKana(void)608 void TransliteratorTest::TestCompoundKana(void) {
609 UParseError parseError;
610 UErrorCode status = U_ZERO_ERROR;
611 Transliterator* t = Transliterator::createInstance("Latin-Hiragana;Hiragana-Latin", UTRANS_FORWARD, parseError, status);
612 if (t == 0) {
613 dataerrln("FAIL: construction of Latin-Hiragana;Hiragana-Latin failed - %s", u_errorName(status));
614 } else {
615 expect(*t, "aaaaa", "aaaaa");
616 delete t;
617 }
618 }
619
620 /**
621 * Compose the hex transliterators forward and reverse.
622 */
TestCompoundHex(void)623 void TransliteratorTest::TestCompoundHex(void) {
624 UParseError parseError;
625 UErrorCode status = U_ZERO_ERROR;
626 Transliterator* a = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
627 Transliterator* b = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, parseError, status);
628 Transliterator* transab[] = { a, b };
629 Transliterator* transba[] = { b, a };
630 if (a == 0 || b == 0) {
631 errln("FAIL: construction failed");
632 delete a;
633 delete b;
634 return;
635 }
636 // Do some basic tests of a
637 expect(*a, "01", UnicodeString("\\u0030\\u0031", ""));
638 // Do some basic tests of b
639 expect(*b, UnicodeString("\\u0030\\u0031", ""), "01");
640
641 Transliterator* ab = new CompoundTransliterator(transab, 2);
642 UnicodeString s("abcde", "");
643 expect(*ab, s, s);
644
645 UnicodeString str(s);
646 a->transliterate(str);
647 Transliterator* ba = new CompoundTransliterator(transba, 2);
648 expect(*ba, str, str);
649
650 delete ab;
651 delete ba;
652 delete a;
653 delete b;
654 }
655
656 int gTestFilterClassID = 0;
657 /**
658 * Used by TestFiltering().
659 */
660 class TestFilter : public UnicodeFilter {
clone() const661 virtual TestFilter* clone() const override {
662 return new TestFilter(*this);
663 }
contains(UChar32 c) const664 virtual UBool contains(UChar32 c) const override {
665 return c != (UChar)0x0063 /*c*/;
666 }
667 // Stubs
toPattern(UnicodeString & result,UBool) const668 virtual UnicodeString& toPattern(UnicodeString& result,
669 UBool /*escapeUnprintable*/) const override {
670 return result;
671 }
matchesIndexValue(uint8_t) const672 virtual UBool matchesIndexValue(uint8_t /*v*/) const override {
673 return false;
674 }
addMatchSetTo(UnicodeSet &) const675 virtual void addMatchSetTo(UnicodeSet& /*toUnionTo*/) const override {}
676 public:
getDynamicClassID() const677 UClassID getDynamicClassID() const override { return (UClassID)&gTestFilterClassID; }
678 };
679
680 /**
681 * Do some basic tests of filtering.
682 */
TestFiltering(void)683 void TransliteratorTest::TestFiltering(void) {
684 UParseError parseError;
685 UErrorCode status = U_ZERO_ERROR;
686 Transliterator* hex = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
687 if (hex == 0) {
688 errln("FAIL: createInstance(Any-Hex) failed");
689 return;
690 }
691 hex->adoptFilter(new TestFilter());
692 UnicodeString s("abcde");
693 hex->transliterate(s);
694 UnicodeString exp("\\u0061\\u0062c\\u0064\\u0065", "");
695 if (s == exp) {
696 logln(UnicodeString("Ok: \"") + exp + "\"");
697 } else {
698 logln(UnicodeString("FAIL: \"") + s + "\", wanted \"" + exp + "\"");
699 }
700
701 // ICU4C ONLY. Do not find Transliterator.orphanFilter() in ICU4J.
702 UnicodeFilter *f = hex->orphanFilter();
703 if (f == NULL){
704 errln("FAIL: orphanFilter() should get a UnicodeFilter");
705 } else {
706 delete f;
707 }
708 delete hex;
709 }
710
711 /**
712 * Test anchors
713 */
TestAnchors(void)714 void TransliteratorTest::TestAnchors(void) {
715 expect(UnicodeString("^a > 0; a$ > 2 ; a > 1;", ""),
716 "aaa",
717 "012");
718 expect(UnicodeString("$s=[z$]; $s{a>0; a}$s>2; a>1;", ""),
719 "aaa",
720 "012");
721 expect(UnicodeString("^ab > 01 ;"
722 " ab > |8 ;"
723 " b > k ;"
724 " 8x$ > 45 ;"
725 " 8x > 77 ;", ""),
726
727 "ababbabxabx",
728 "018k7745");
729 expect(UnicodeString("$s = [z$] ;"
730 "$s{ab > 01 ;"
731 " ab > |8 ;"
732 " b > k ;"
733 " 8x}$s > 45 ;"
734 " 8x > 77 ;", ""),
735
736 "abzababbabxzabxabx",
737 "01z018k45z01x45");
738 }
739
740 /**
741 * Test pattern quoting and escape mechanisms.
742 */
TestPatternQuoting(void)743 void TransliteratorTest::TestPatternQuoting(void) {
744 // Array of 3n items
745 // Each item is <rules>, <input>, <expected output>
746 const UnicodeString DATA[] = {
747 UnicodeString(UChar(0x4E01)) + ">'[male adult]'",
748 UnicodeString(UChar(0x4E01)),
749 "[male adult]"
750 };
751
752 for (int32_t i=0; i<3; i+=3) {
753 logln(UnicodeString("Pattern: ") + prettify(DATA[i]));
754 UParseError parseError;
755 UErrorCode status = U_ZERO_ERROR;
756 Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
757 if (U_FAILURE(status)) {
758 errln("RBT constructor failed");
759 } else {
760 expect(*t, DATA[i+1], DATA[i+2]);
761 }
762 delete t;
763 }
764 }
765
766 /**
767 * Regression test for bugs found in Greek transliteration.
768 */
TestJ277(void)769 void TransliteratorTest::TestJ277(void) {
770 UErrorCode status = U_ZERO_ERROR;
771 UParseError parseError;
772 Transliterator *gl = Transliterator::createInstance("Greek-Latin; NFD; [:M:]Remove; NFC", UTRANS_FORWARD, parseError, status);
773 if (gl == NULL) {
774 dataerrln("FAIL: createInstance(Greek-Latin) returned NULL - %s", u_errorName(status));
775 return;
776 }
777
778 UChar sigma = 0x3C3;
779 UChar upsilon = 0x3C5;
780 UChar nu = 0x3BD;
781 // UChar PHI = 0x3A6;
782 UChar alpha = 0x3B1;
783 // UChar omega = 0x3C9;
784 // UChar omicron = 0x3BF;
785 // UChar epsilon = 0x3B5;
786
787 // sigma upsilon nu -> syn
788 UnicodeString syn;
789 syn.append(sigma).append(upsilon).append(nu);
790 expect(*gl, syn, "syn");
791
792 // sigma alpha upsilon nu -> saun
793 UnicodeString sayn;
794 sayn.append(sigma).append(alpha).append(upsilon).append(nu);
795 expect(*gl, sayn, "saun");
796
797 // Again, using a smaller rule set
798 UnicodeString rules(
799 "$alpha = \\u03B1;"
800 "$nu = \\u03BD;"
801 "$sigma = \\u03C3;"
802 "$ypsilon = \\u03C5;"
803 "$vowel = [aeiouAEIOU$alpha$ypsilon];"
804 "s <> $sigma;"
805 "a <> $alpha;"
806 "u <> $vowel { $ypsilon;"
807 "y <> $ypsilon;"
808 "n <> $nu;",
809 "");
810 Transliterator *mini = Transliterator::createFromRules("mini", rules, UTRANS_REVERSE, parseError, status);
811 if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
812 expect(*mini, syn, "syn");
813 expect(*mini, sayn, "saun");
814 delete mini;
815 mini = NULL;
816
817 #if !UCONFIG_NO_FORMATTING
818 // Transliterate the Greek locale data
819 Locale el("el");
820 DateFormatSymbols syms(el, status);
821 if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
822 int32_t i, count;
823 const UnicodeString* data = syms.getMonths(count);
824 for (i=0; i<count; ++i) {
825 if (data[i].length() == 0) {
826 continue;
827 }
828 UnicodeString out(data[i]);
829 gl->transliterate(out);
830 UBool ok = true;
831 if (data[i].length() >= 2 && out.length() >= 2 &&
832 u_isupper(data[i].charAt(0)) && u_islower(data[i].charAt(1))) {
833 if (!(u_isupper(out.charAt(0)) && u_islower(out.charAt(1)))) {
834 ok = false;
835 }
836 }
837 if (ok) {
838 logln(prettify(data[i] + " -> " + out));
839 } else {
840 errln(UnicodeString("FAIL: ") + prettify(data[i] + " -> " + out));
841 }
842 }
843 #endif
844
845 delete gl;
846 }
847
848 /**
849 * Prefix, suffix support in hex transliterators
850 */
TestJ243(void)851 void TransliteratorTest::TestJ243(void) {
852 UErrorCode ec = U_ZERO_ERROR;
853
854 // Test default Hex-Any, which should handle
855 // \u, \U, u+, and U+
856 Transliterator *hex =
857 Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, ec);
858 if (assertSuccess("getInstance", ec)) {
859 expect(*hex, UnicodeString("\\u0041+\\U00000042,U+0043uU+0044z", ""), "A+B,CuDz");
860 }
861 delete hex;
862
863 // // Try a custom Hex-Unicode
864 // // \uXXXX and &#xXXXX;
865 // ec = U_ZERO_ERROR;
866 // HexToUnicodeTransliterator hex2(UnicodeString("\\\\u###0;&\\#x###0\\;", ""), ec);
867 // expect(hex2, UnicodeString("\\u61\\u062\\u0063\\u00645\\u66x0123", ""),
868 // "abcd5fx0123");
869 // // Try custom Any-Hex (default is tested elsewhere)
870 // ec = U_ZERO_ERROR;
871 // UnicodeToHexTransliterator hex3(UnicodeString("&\\#x###0;", ""), ec);
872 // expect(hex3, "012", "012");
873 }
874
875 /**
876 * Parsers need better syntax error messages.
877 */
TestJ329(void)878 void TransliteratorTest::TestJ329(void) {
879
880 struct { UBool containsErrors; const char* rule; } DATA[] = {
881 { false, "a > b; c > d" },
882 { true, "a > b; no operator; c > d" },
883 };
884 int32_t DATA_length = UPRV_LENGTHOF(DATA);
885
886 for (int32_t i=0; i<DATA_length; ++i) {
887 UErrorCode status = U_ZERO_ERROR;
888 UParseError parseError;
889 Transliterator *rbt = Transliterator::createFromRules("<ID>",
890 DATA[i].rule,
891 UTRANS_FORWARD,
892 parseError,
893 status);
894 UBool gotError = U_FAILURE(status);
895 UnicodeString desc(DATA[i].rule);
896 desc.append(gotError ? " -> error" : " -> no error");
897 if (gotError) {
898 desc = desc + ", ParseError code=" + u_errorName(status) +
899 " line=" + parseError.line +
900 " offset=" + parseError.offset +
901 " context=" + parseError.preContext;
902 }
903 if (gotError == DATA[i].containsErrors) {
904 logln(UnicodeString("Ok: ") + desc);
905 } else {
906 errln(UnicodeString("FAIL: ") + desc);
907 }
908 delete rbt;
909 }
910 }
911
912 /**
913 * Test segments and segment references.
914 */
TestSegments(void)915 void TransliteratorTest::TestSegments(void) {
916 // Array of 3n items
917 // Each item is <rules>, <input>, <expected output>
918 UnicodeString DATA[] = {
919 "([a-z]) '.' ([0-9]) > $2 '-' $1",
920 "abc.123.xyz.456",
921 "ab1-c23.xy4-z56",
922
923 // nested
924 "(([a-z])([0-9])) > $1 '.' $2 '.' $3;",
925 "a1 b2",
926 "a1.a.1 b2.b.2",
927 };
928 int32_t DATA_length = UPRV_LENGTHOF(DATA);
929
930 for (int32_t i=0; i<DATA_length; i+=3) {
931 logln("Pattern: " + prettify(DATA[i]));
932 UParseError parseError;
933 UErrorCode status = U_ZERO_ERROR;
934 Transliterator *t = Transliterator::createFromRules("ID", DATA[i], UTRANS_FORWARD, parseError, status);
935 if (U_FAILURE(status)) {
936 errln("FAIL: RBT constructor");
937 } else {
938 expect(*t, DATA[i+1], DATA[i+2]);
939 }
940 delete t;
941 }
942 }
943
944 /**
945 * Test cursor positioning outside of the key
946 */
TestCursorOffset(void)947 void TransliteratorTest::TestCursorOffset(void) {
948 // Array of 3n items
949 // Each item is <rules>, <input>, <expected output>
950 UnicodeString DATA[] = {
951 "pre {alpha} post > | @ ALPHA ;"
952 "eALPHA > beta ;"
953 "pre {beta} post > BETA @@ | ;"
954 "post > xyz",
955
956 "prealphapost prebetapost",
957
958 "prbetaxyz preBETApost",
959 };
960 int32_t DATA_length = UPRV_LENGTHOF(DATA);
961
962 for (int32_t i=0; i<DATA_length; i+=3) {
963 logln("Pattern: " + prettify(DATA[i]));
964 UParseError parseError;
965 UErrorCode status = U_ZERO_ERROR;
966 Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
967 if (U_FAILURE(status)) {
968 errln("FAIL: RBT constructor");
969 } else {
970 expect(*t, DATA[i+1], DATA[i+2]);
971 }
972 delete t;
973 }
974 }
975
976 /**
977 * Test zero length and > 1 char length variable values. Test
978 * use of variable refs in UnicodeSets.
979 */
TestArbitraryVariableValues(void)980 void TransliteratorTest::TestArbitraryVariableValues(void) {
981 // Array of 3n items
982 // Each item is <rules>, <input>, <expected output>
983 UnicodeString DATA[] = {
984 "$abe = ab;"
985 "$pat = x[yY]z;"
986 "$ll = 'a-z';"
987 "$llZ = [$ll];"
988 "$llY = [$ll$pat];"
989 "$emp = ;"
990
991 "$abe > ABE;"
992 "$pat > END;"
993 "$llZ > 1;"
994 "$llY > 2;"
995 "7$emp 8 > 9;"
996 "",
997
998 "ab xYzxyz stY78",
999 "ABE ENDEND 1129",
1000 };
1001 int32_t DATA_length = UPRV_LENGTHOF(DATA);
1002
1003 for (int32_t i=0; i<DATA_length; i+=3) {
1004 logln("Pattern: " + prettify(DATA[i]));
1005 UParseError parseError;
1006 UErrorCode status = U_ZERO_ERROR;
1007 Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
1008 if (U_FAILURE(status)) {
1009 errln("FAIL: RBT constructor");
1010 } else {
1011 expect(*t, DATA[i+1], DATA[i+2]);
1012 }
1013 delete t;
1014 }
1015 }
1016
1017 /**
1018 * Confirm that the contextStart, contextLimit, start, and limit
1019 * behave correctly. J474.
1020 */
TestPositionHandling(void)1021 void TransliteratorTest::TestPositionHandling(void) {
1022 // Array of 3n items
1023 // Each item is <rules>, <input>, <expected output>
1024 const char* DATA[] = {
1025 "a{t} > SS ; {t}b > UU ; {t} > TT ;",
1026 "xtat txtb", // pos 0,9,0,9
1027 "xTTaSS TTxUUb",
1028
1029 "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1030 "xtat txtb", // pos 2,9,3,8
1031 "xtaSS TTxUUb",
1032
1033 "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1034 "xtat txtb", // pos 3,8,3,8
1035 "xtaTT TTxTTb",
1036 };
1037
1038 // Array of 4n positions -- these go with the DATA array
1039 // They are: contextStart, contextLimit, start, limit
1040 int32_t POS[] = {
1041 0, 9, 0, 9,
1042 2, 9, 3, 8,
1043 3, 8, 3, 8,
1044 };
1045
1046 int32_t n = UPRV_LENGTHOF(DATA) / 3;
1047 for (int32_t i=0; i<n; i++) {
1048 UErrorCode status = U_ZERO_ERROR;
1049 UParseError parseError;
1050 Transliterator *t = Transliterator::createFromRules("<ID>",
1051 DATA[3*i], UTRANS_FORWARD, parseError, status);
1052 if (U_FAILURE(status)) {
1053 delete t;
1054 errln("FAIL: RBT constructor");
1055 return;
1056 }
1057 UTransPosition pos;
1058 pos.contextStart= POS[4*i];
1059 pos.contextLimit = POS[4*i+1];
1060 pos.start = POS[4*i+2];
1061 pos.limit = POS[4*i+3];
1062 UnicodeString rsource(DATA[3*i+1]);
1063 t->transliterate(rsource, pos, status);
1064 if (U_FAILURE(status)) {
1065 delete t;
1066 errln("FAIL: transliterate");
1067 return;
1068 }
1069 t->finishTransliteration(rsource, pos);
1070 expectAux(DATA[3*i],
1071 DATA[3*i+1],
1072 rsource,
1073 DATA[3*i+2]);
1074 delete t;
1075 }
1076 }
1077
1078 /**
1079 * Test the Hiragana-Katakana transliterator.
1080 */
TestHiraganaKatakana(void)1081 void TransliteratorTest::TestHiraganaKatakana(void) {
1082 UParseError parseError;
1083 UErrorCode status = U_ZERO_ERROR;
1084 Transliterator* hk = Transliterator::createInstance("Hiragana-Katakana", UTRANS_FORWARD, parseError, status);
1085 Transliterator* kh = Transliterator::createInstance("Katakana-Hiragana", UTRANS_FORWARD, parseError, status);
1086 if (hk == 0 || kh == 0) {
1087 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1088 delete hk;
1089 delete kh;
1090 return;
1091 }
1092
1093 // Array of 3n items
1094 // Each item is "hk"|"kh"|"both", <Hiragana>, <Katakana>
1095 const char* DATA[] = {
1096 "both",
1097 "\\u3042\\u3090\\u3099\\u3092\\u3050",
1098 "\\u30A2\\u30F8\\u30F2\\u30B0",
1099
1100 "kh",
1101 "\\u307C\\u3051\\u3060\\u3042\\u3093\\u30FC",
1102 "\\u30DC\\u30F6\\u30C0\\u30FC\\u30F3\\u30FC",
1103 };
1104 int32_t DATA_length = UPRV_LENGTHOF(DATA);
1105
1106 for (int32_t i=0; i<DATA_length; i+=3) {
1107 UnicodeString h = CharsToUnicodeString(DATA[i+1]);
1108 UnicodeString k = CharsToUnicodeString(DATA[i+2]);
1109 switch (*DATA[i]) {
1110 case 0x68: //'h': // Hiragana-Katakana
1111 expect(*hk, h, k);
1112 break;
1113 case 0x6B: //'k': // Katakana-Hiragana
1114 expect(*kh, k, h);
1115 break;
1116 case 0x62: //'b': // both
1117 expect(*hk, h, k);
1118 expect(*kh, k, h);
1119 break;
1120 }
1121 }
1122 delete hk;
1123 delete kh;
1124 }
1125
1126 /**
1127 * Test cloning / copy constructor of RBT.
1128 */
TestCopyJ476(void)1129 void TransliteratorTest::TestCopyJ476(void) {
1130 // The real test here is what happens when the destructors are
1131 // called. So we let one object get destructed, and check to
1132 // see that its copy still works.
1133 Transliterator *t2 = 0;
1134 {
1135 UParseError parseError;
1136 UErrorCode status = U_ZERO_ERROR;
1137 Transliterator *t1 = Transliterator::createFromRules("t1",
1138 "a>A;b>B;'foo'+>'bar'", UTRANS_FORWARD, parseError, status);
1139 if (U_FAILURE(status)) {
1140 errln("FAIL: RBT constructor");
1141 return;
1142 }
1143 t2 = t1->clone(); // Call copy constructor under the covers.
1144 expect(*t1, "abcfoofoo", "ABcbar");
1145 delete t1;
1146 }
1147 expect(*t2, "abcfoofoo", "ABcbar");
1148 delete t2;
1149 }
1150
1151 /**
1152 * Test inter-Indic transliterators. These are composed.
1153 * ICU4C Jitterbug 483.
1154 */
TestInterIndic(void)1155 void TransliteratorTest::TestInterIndic(void) {
1156 UnicodeString ID("Devanagari-Gujarati", "");
1157 UErrorCode status = U_ZERO_ERROR;
1158 UParseError parseError;
1159 Transliterator* dg = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1160 if (dg == 0) {
1161 dataerrln("FAIL: createInstance(" + ID + ") returned NULL - " + u_errorName(status));
1162 return;
1163 }
1164 UnicodeString id = dg->getID();
1165 if (id != ID) {
1166 errln("FAIL: createInstance(" + ID + ")->getID() => " + id);
1167 }
1168 UnicodeString dev = CharsToUnicodeString("\\u0901\\u090B\\u0925");
1169 UnicodeString guj = CharsToUnicodeString("\\u0A81\\u0A8B\\u0AA5");
1170 expect(*dg, dev, guj);
1171 delete dg;
1172 }
1173
1174 /**
1175 * Test filter syntax in IDs. (J918)
1176 */
TestFilterIDs(void)1177 void TransliteratorTest::TestFilterIDs(void) {
1178 // Array of 3n strings:
1179 // <id>, <inverse id>, <input>, <expected output>
1180 const char* DATA[] = {
1181 "[aeiou]Any-Hex", // ID
1182 "[aeiou]Hex-Any", // expected inverse ID
1183 "quizzical", // src
1184 "q\\u0075\\u0069zz\\u0069c\\u0061l", // expected ID.translit(src)
1185
1186 "[aeiou]Any-Hex;[^5]Hex-Any",
1187 "[^5]Any-Hex;[aeiou]Hex-Any",
1188 "quizzical",
1189 "q\\u0075izzical",
1190
1191 "[abc]Null",
1192 "[abc]Null",
1193 "xyz",
1194 "xyz",
1195 };
1196 enum { DATA_length = UPRV_LENGTHOF(DATA) };
1197
1198 for (int i=0; i<DATA_length; i+=4) {
1199 UnicodeString ID(DATA[i], "");
1200 UnicodeString uID(DATA[i+1], "");
1201 UnicodeString data2(DATA[i+2], "");
1202 UnicodeString data3(DATA[i+3], "");
1203 UParseError parseError;
1204 UErrorCode status = U_ZERO_ERROR;
1205 Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1206 if (t == 0) {
1207 errln("FAIL: createInstance(" + ID + ") returned NULL");
1208 return;
1209 }
1210 expect(*t, data2, data3);
1211
1212 // Check the ID
1213 if (ID != t->getID()) {
1214 errln("FAIL: createInstance(" + ID + ").getID() => " +
1215 t->getID());
1216 }
1217
1218 // Check the inverse
1219 Transliterator *u = t->createInverse(status);
1220 if (u == 0) {
1221 errln("FAIL: " + ID + ".createInverse() returned NULL");
1222 } else if (u->getID() != uID) {
1223 errln("FAIL: " + ID + ".createInverse().getID() => " +
1224 u->getID() + ", expected " + uID);
1225 }
1226
1227 delete t;
1228 delete u;
1229 }
1230 }
1231
1232 /**
1233 * Test the case mapping transliterators.
1234 */
TestCaseMap(void)1235 void TransliteratorTest::TestCaseMap(void) {
1236 UParseError parseError;
1237 UErrorCode status = U_ZERO_ERROR;
1238 Transliterator* toUpper =
1239 Transliterator::createInstance("Any-Upper[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1240 Transliterator* toLower =
1241 Transliterator::createInstance("Any-Lower[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1242 Transliterator* toTitle =
1243 Transliterator::createInstance("Any-Title[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1244 if (toUpper==0 || toLower==0 || toTitle==0) {
1245 errln("FAIL: createInstance returned NULL");
1246 delete toUpper;
1247 delete toLower;
1248 delete toTitle;
1249 return;
1250 }
1251
1252 expect(*toUpper, "The quick brown fox jumped over the lazy dogs.",
1253 "THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
1254 expect(*toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
1255 "the quick brown foX jumped over the lazY dogs.");
1256 expect(*toTitle, "the quick brown foX can't jump over the laZy dogs.",
1257 "The Quick Brown FoX Can't Jump Over The LaZy Dogs.");
1258
1259 delete toUpper;
1260 delete toLower;
1261 delete toTitle;
1262 }
1263
1264 /**
1265 * Test the name mapping transliterators.
1266 */
TestNameMap(void)1267 void TransliteratorTest::TestNameMap(void) {
1268 UParseError parseError;
1269 UErrorCode status = U_ZERO_ERROR;
1270 Transliterator* uni2name =
1271 Transliterator::createInstance("Any-Name[^abc]", UTRANS_FORWARD, parseError, status);
1272 Transliterator* name2uni =
1273 Transliterator::createInstance("Name-Any", UTRANS_FORWARD, parseError, status);
1274 if (uni2name==0 || name2uni==0) {
1275 errln("FAIL: createInstance returned NULL");
1276 delete uni2name;
1277 delete name2uni;
1278 return;
1279 }
1280
1281 // Careful: CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1282 expect(*uni2name, CharsToUnicodeString("\\u00A0abc\\u4E01\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF"),
1283 CharsToUnicodeString("\\\\N{NO-BREAK SPACE}abc\\\\N{CJK UNIFIED IDEOGRAPH-4E01}\\\\N{MICRO SIGN}\\\\N{GUJARATI SIGN CANDRABINDU}\\\\N{REPLACEMENT CHARACTER}\\\\N{<control-0004>}\\\\N{<control-0009>}\\\\N{<control-0081>}\\\\N{<noncharacter-FFFF>}"));
1284 expect(*name2uni, UNICODE_STRING_SIMPLE("{\\N { NO-BREAK SPACE}abc\\N{ CJK UNIFIED IDEOGRAPH-4E01 }\\N{x\\N{MICRO SIGN}\\N{GUJARATI SIGN CANDRABINDU}\\N{REPLACEMENT CHARACTER}\\N{<control-0004>}\\N{<control-0009>}\\N{<control-0081>}\\N{<noncharacter-FFFF>}\\N{<control-0004>}\\N{"),
1285 CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{"));
1286
1287 delete uni2name;
1288 delete name2uni;
1289
1290 // round trip
1291 Transliterator* t =
1292 Transliterator::createInstance("Any-Name;Name-Any", UTRANS_FORWARD, parseError, status);
1293 if (t==0) {
1294 errln("FAIL: createInstance returned NULL");
1295 delete t;
1296 return;
1297 }
1298
1299 // Careful: CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1300 UnicodeString s = CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{");
1301 expect(*t, s, s);
1302 delete t;
1303 }
1304
1305 /**
1306 * Test liberalized ID syntax. 1006c
1307 */
TestLiberalizedID(void)1308 void TransliteratorTest::TestLiberalizedID(void) {
1309 // Some test cases have an expected getID() value of NULL. This
1310 // means I have disabled the test case for now. This stuff is
1311 // still under development, and I haven't decided whether to make
1312 // getID() return canonical case yet. It will all get rewritten
1313 // with the move to Source-Target/Variant IDs anyway. [aliu]
1314 const char* DATA[] = {
1315 "latin-greek", NULL /*"Latin-Greek"*/, "case insensitivity",
1316 " Null ", "Null", "whitespace",
1317 " Latin[a-z]-Greek ", "[a-z]Latin-Greek", "inline filter",
1318 " null ; latin-greek ", NULL /*"Null;Latin-Greek"*/, "compound whitespace",
1319 };
1320 const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1321 UParseError parseError;
1322 UErrorCode status= U_ZERO_ERROR;
1323 for (int32_t i=0; i<DATA_length; i+=3) {
1324 Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, parseError, status);
1325 if (t == 0) {
1326 dataerrln(UnicodeString("FAIL: ") + DATA[i+2] +
1327 " cannot create ID \"" + DATA[i] + "\" - " + u_errorName(status));
1328 } else {
1329 UnicodeString exp;
1330 if (DATA[i+1]) {
1331 exp = UnicodeString(DATA[i+1], "");
1332 }
1333 // Don't worry about getID() if the expected char*
1334 // is NULL -- see above.
1335 if (exp.length() == 0 || exp == t->getID()) {
1336 logln(UnicodeString("Ok: ") + DATA[i+2] +
1337 " create ID \"" + DATA[i] + "\" => \"" +
1338 exp + "\"");
1339 } else {
1340 errln(UnicodeString("FAIL: ") + DATA[i+2] +
1341 " create ID \"" + DATA[i] + "\" => \"" +
1342 t->getID() + "\", exp \"" + exp + "\"");
1343 }
1344 delete t;
1345 }
1346 }
1347 }
1348
1349 /* test for Jitterbug 912 */
TestCreateInstance()1350 void TransliteratorTest::TestCreateInstance(){
1351 const char* FORWARD = "F";
1352 const char* REVERSE = "R";
1353 const char* DATA[] = {
1354 // Column 1: id
1355 // Column 2: direction
1356 // Column 3: expected ID, or "" if expect failure
1357 "Latin-Hangul", REVERSE, "Hangul-Latin", // JB#912
1358
1359 // JB#2689: bad compound causes crash
1360 "InvalidSource-InvalidTarget", FORWARD, "",
1361 "InvalidSource-InvalidTarget", REVERSE, "",
1362 "Hex-Any;InvalidSource-InvalidTarget", FORWARD, "",
1363 "Hex-Any;InvalidSource-InvalidTarget", REVERSE, "",
1364 "InvalidSource-InvalidTarget;Hex-Any", FORWARD, "",
1365 "InvalidSource-InvalidTarget;Hex-Any", REVERSE, "",
1366
1367 NULL
1368 };
1369
1370 for (int32_t i=0; DATA[i]; i+=3) {
1371 UParseError err;
1372 UErrorCode ec = U_ZERO_ERROR;
1373 UnicodeString id(DATA[i]);
1374 UTransDirection dir = (DATA[i+1]==FORWARD)?
1375 UTRANS_FORWARD:UTRANS_REVERSE;
1376 UnicodeString expID(DATA[i+2]);
1377 Transliterator* t =
1378 Transliterator::createInstance(id,dir,err,ec);
1379 UnicodeString newID;
1380 if (t) {
1381 newID = t->getID();
1382 }
1383 UBool ok = (newID == expID);
1384 if (!t) {
1385 newID = u_errorName(ec);
1386 }
1387 if (ok) {
1388 logln((UnicodeString)"Ok: createInstance(" +
1389 id + "," + DATA[i+1] + ") => " + newID);
1390 } else {
1391 dataerrln((UnicodeString)"FAIL: createInstance(" +
1392 id + "," + DATA[i+1] + ") => " + newID +
1393 ", expected " + expID);
1394 }
1395 delete t;
1396 }
1397 }
1398
1399 /**
1400 * Test the normalization transliterator.
1401 */
TestNormalizationTransliterator()1402 void TransliteratorTest::TestNormalizationTransliterator() {
1403 // THE FOLLOWING TWO TABLES ARE COPIED FROM com.ibm.test.normalizer.BasicTest
1404 // PLEASE KEEP THEM IN SYNC WITH BasicTest.
1405 const char* CANON[] = {
1406 // Input Decomposed Composed
1407 "cat", "cat", "cat" ,
1408 "\\u00e0ardvark", "a\\u0300ardvark", "\\u00e0ardvark" ,
1409
1410 "\\u1e0a", "D\\u0307", "\\u1e0a" , // D-dot_above
1411 "D\\u0307", "D\\u0307", "\\u1e0a" , // D dot_above
1412
1413 "\\u1e0c\\u0307", "D\\u0323\\u0307", "\\u1e0c\\u0307" , // D-dot_below dot_above
1414 "\\u1e0a\\u0323", "D\\u0323\\u0307", "\\u1e0c\\u0307" , // D-dot_above dot_below
1415 "D\\u0307\\u0323", "D\\u0323\\u0307", "\\u1e0c\\u0307" , // D dot_below dot_above
1416
1417 "\\u1e10\\u0307\\u0323", "D\\u0327\\u0323\\u0307","\\u1e10\\u0323\\u0307", // D dot_below cedilla dot_above
1418 "D\\u0307\\u0328\\u0323","D\\u0328\\u0323\\u0307","\\u1e0c\\u0328\\u0307", // D dot_above ogonek dot_below
1419
1420 "\\u1E14", "E\\u0304\\u0300", "\\u1E14" , // E-macron-grave
1421 "\\u0112\\u0300", "E\\u0304\\u0300", "\\u1E14" , // E-macron + grave
1422 "\\u00c8\\u0304", "E\\u0300\\u0304", "\\u00c8\\u0304" , // E-grave + macron
1423
1424 "\\u212b", "A\\u030a", "\\u00c5" , // angstrom_sign
1425 "\\u00c5", "A\\u030a", "\\u00c5" , // A-ring
1426
1427 "\\u00fdffin", "y\\u0301ffin", "\\u00fdffin" , //updated with 3.0
1428 "\\u00fd\\uFB03n", "y\\u0301\\uFB03n", "\\u00fd\\uFB03n" , //updated with 3.0
1429
1430 "Henry IV", "Henry IV", "Henry IV" ,
1431 "Henry \\u2163", "Henry \\u2163", "Henry \\u2163" ,
1432
1433 "\\u30AC", "\\u30AB\\u3099", "\\u30AC" , // ga (Katakana)
1434 "\\u30AB\\u3099", "\\u30AB\\u3099", "\\u30AC" , // ka + ten
1435 "\\uFF76\\uFF9E", "\\uFF76\\uFF9E", "\\uFF76\\uFF9E" , // hw_ka + hw_ten
1436 "\\u30AB\\uFF9E", "\\u30AB\\uFF9E", "\\u30AB\\uFF9E" , // ka + hw_ten
1437 "\\uFF76\\u3099", "\\uFF76\\u3099", "\\uFF76\\u3099" , // hw_ka + ten
1438
1439 "A\\u0300\\u0316", "A\\u0316\\u0300", "\\u00C0\\u0316" ,
1440 0 // end
1441 };
1442
1443 const char* COMPAT[] = {
1444 // Input Decomposed Composed
1445 "\\uFB4f", "\\u05D0\\u05DC", "\\u05D0\\u05DC" , // Alef-Lamed vs. Alef, Lamed
1446
1447 "\\u00fdffin", "y\\u0301ffin", "\\u00fdffin" , //updated for 3.0
1448 "\\u00fd\\uFB03n", "y\\u0301ffin", "\\u00fdffin" , // ffi ligature -> f + f + i
1449
1450 "Henry IV", "Henry IV", "Henry IV" ,
1451 "Henry \\u2163", "Henry IV", "Henry IV" ,
1452
1453 "\\u30AC", "\\u30AB\\u3099", "\\u30AC" , // ga (Katakana)
1454 "\\u30AB\\u3099", "\\u30AB\\u3099", "\\u30AC" , // ka + ten
1455
1456 "\\uFF76\\u3099", "\\u30AB\\u3099", "\\u30AC" , // hw_ka + ten
1457 0 // end
1458 };
1459
1460 int32_t i;
1461 UParseError parseError;
1462 UErrorCode status = U_ZERO_ERROR;
1463 Transliterator* NFD = Transliterator::createInstance("NFD", UTRANS_FORWARD, parseError, status);
1464 Transliterator* NFC = Transliterator::createInstance("NFC", UTRANS_FORWARD, parseError, status);
1465 if (!NFD || !NFC) {
1466 dataerrln("FAIL: createInstance failed: %s", u_errorName(status));
1467 delete NFD;
1468 delete NFC;
1469 return;
1470 }
1471 for (i=0; CANON[i]; i+=3) {
1472 UnicodeString in = CharsToUnicodeString(CANON[i]);
1473 UnicodeString expd = CharsToUnicodeString(CANON[i+1]);
1474 UnicodeString expc = CharsToUnicodeString(CANON[i+2]);
1475 expect(*NFD, in, expd);
1476 expect(*NFC, in, expc);
1477 }
1478 delete NFD;
1479 delete NFC;
1480
1481 Transliterator* NFKD = Transliterator::createInstance("NFKD", UTRANS_FORWARD, parseError, status);
1482 Transliterator* NFKC = Transliterator::createInstance("NFKC", UTRANS_FORWARD, parseError, status);
1483 if (!NFKD || !NFKC) {
1484 dataerrln("FAIL: createInstance failed");
1485 delete NFKD;
1486 delete NFKC;
1487 return;
1488 }
1489 for (i=0; COMPAT[i]; i+=3) {
1490 UnicodeString in = CharsToUnicodeString(COMPAT[i]);
1491 UnicodeString expkd = CharsToUnicodeString(COMPAT[i+1]);
1492 UnicodeString expkc = CharsToUnicodeString(COMPAT[i+2]);
1493 expect(*NFKD, in, expkd);
1494 expect(*NFKC, in, expkc);
1495 }
1496 delete NFKD;
1497 delete NFKC;
1498
1499 UParseError pe;
1500 status = U_ZERO_ERROR;
1501 Transliterator *t = Transliterator::createInstance("NFD; [x]Remove",
1502 UTRANS_FORWARD,
1503 pe, status);
1504 if (t == 0) {
1505 errln("FAIL: createInstance failed");
1506 }
1507 expect(*t, CharsToUnicodeString("\\u010dx"),
1508 CharsToUnicodeString("c\\u030C"));
1509 delete t;
1510 }
1511
1512 /**
1513 * Test we can create basic transliterator even without data.
1514 */
TestBasicTransliteratorEvenWithoutData()1515 void TransliteratorTest::TestBasicTransliteratorEvenWithoutData() {
1516 const char16_t* TEST_DATA = u"\u0124e\u0301 \uFB01nd x";
1517 const char16_t* EXPECTED_RESULTS[] = {
1518 u"H\u0302e\u0301 \uFB01nd x", // NFD
1519 u"\u0124\u00E9 \uFB01nd x", // NFC
1520 u"H\u0302e\u0301 find x", // NFKD
1521 u"\u0124\u00E9 find x", // NFKC
1522 u"\u0124e\u0301 \uFB01nd x", // Hex-Any
1523 u"\u0125e\u0301 \uFB01nd x", // Lower
1524 u"\u0124e\uFB01ndx", // [:^L:]Remove
1525 u"H\u0302e\u0301 \uFB01nd ", // NFD; [x]Remove
1526 u"h\u0302e\u0301 find x", // Lower; NFKD;
1527 u"hefindx", // Lower; NFKD; [:^L:]Remove; NFC;
1528 u"\u0124e \uFB01nd x", // [:Nonspacing Mark:] Remove;
1529 u"He \uFB01nd x", // NFD; [:Nonspacing Mark:] Remove; NFC;
1530 // end
1531 0
1532 };
1533
1534 const char* BASIC_TRANSLITERATOR_ID[] = {
1535 "NFD",
1536 "NFC",
1537 "NFKD",
1538 "NFKC",
1539 "Hex-Any",
1540 "Lower",
1541 "[:^L:]Remove",
1542 "NFD; [x]Remove",
1543 "Lower; NFKD;",
1544 "Lower; NFKD; [:^L:]Remove; NFC;",
1545 "[:Nonspacing Mark:] Remove;",
1546 "NFD; [:Nonspacing Mark:] Remove; NFC;",
1547 // end
1548 0
1549 };
1550 const char* BASIC_TRANSLITERATOR_RULES[] = {
1551 "::Lower; ::NFKD;",
1552 "::Lower; ::NFKD; ::[:^L:]Remove; ::NFC;",
1553 "::[:Nonspacing Mark:] Remove;",
1554 "::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;",
1555 // end
1556 0
1557 };
1558 for (int32_t i=0; BASIC_TRANSLITERATOR_ID[i]; i++) {
1559 UErrorCode status = U_ZERO_ERROR;
1560 UParseError parseError;
1561 std::unique_ptr<Transliterator> translit(Transliterator::createInstance(
1562 BASIC_TRANSLITERATOR_ID[i], UTRANS_FORWARD, parseError, status));
1563 if (translit.get() == nullptr || !U_SUCCESS(status)) {
1564 dataerrln("FAIL: createInstance %s failed", BASIC_TRANSLITERATOR_ID[i]);
1565 continue;
1566 }
1567 UnicodeString data(TEST_DATA);
1568 UnicodeString expected(EXPECTED_RESULTS[i]);
1569 translit->transliterate(data);
1570 if (data != expected) {
1571 dataerrln(UnicodeString("FAIL: expected translit(") +
1572 BASIC_TRANSLITERATOR_ID[i] + ") = '" +
1573 EXPECTED_RESULTS[i] + "' but got '" + data);
1574 continue;
1575 }
1576 }
1577 for (int32_t i=0; BASIC_TRANSLITERATOR_RULES[i]; i++) {
1578 UErrorCode status = U_ZERO_ERROR;
1579 UParseError parseError;
1580 std::unique_ptr<Transliterator> translit(Transliterator::createFromRules(
1581 "Test",
1582 BASIC_TRANSLITERATOR_RULES[i], UTRANS_FORWARD, parseError, status));
1583 if (translit.get() == nullptr || !U_SUCCESS(status)) {
1584 dataerrln("FAIL: createFromRules %s failed", BASIC_TRANSLITERATOR_RULES[i]);
1585 continue;
1586 }
1587 }
1588 }
1589
1590 /**
1591 * Test compound RBT rules.
1592 */
TestCompoundRBT(void)1593 void TransliteratorTest::TestCompoundRBT(void) {
1594 // Careful with spacing and ';' here: Phrase this exactly
1595 // as toRules() is going to return it. If toRules() changes
1596 // with regard to spacing or ';', then adjust this string.
1597 UnicodeString rule("::Hex-Any;\n"
1598 "::Any-Lower;\n"
1599 "a > '.A.';\n"
1600 "b > '.B.';\n"
1601 "::[^t]Any-Upper;", "");
1602 UParseError parseError;
1603 UErrorCode status = U_ZERO_ERROR;
1604 Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, parseError, status);
1605 if (t == 0) {
1606 errln("FAIL: createFromRules failed");
1607 return;
1608 }
1609 expect(*t, UNICODE_STRING_SIMPLE("\\u0043at in the hat, bat on the mat"),
1610 "C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
1611 UnicodeString r;
1612 t->toRules(r, true);
1613 if (r == rule) {
1614 logln((UnicodeString)"OK: toRules() => " + r);
1615 } else {
1616 errln((UnicodeString)"FAIL: toRules() => " + r +
1617 ", expected " + rule);
1618 }
1619 delete t;
1620
1621 // Now test toRules
1622 t = Transliterator::createInstance("Greek-Latin; Latin-Cyrillic", UTRANS_FORWARD, parseError, status);
1623 if (t == 0) {
1624 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1625 return;
1626 }
1627 UnicodeString exp("::Greek-Latin;\n::Latin-Cyrillic;");
1628 t->toRules(r, true);
1629 if (r != exp) {
1630 errln((UnicodeString)"FAIL: toRules() => " + r +
1631 ", expected " + exp);
1632 } else {
1633 logln((UnicodeString)"OK: toRules() => " + r);
1634 }
1635 delete t;
1636
1637 // Round trip the result of toRules
1638 t = Transliterator::createFromRules("Test", r, UTRANS_FORWARD, parseError, status);
1639 if (t == 0) {
1640 errln("FAIL: createFromRules #2 failed");
1641 return;
1642 } else {
1643 logln((UnicodeString)"OK: createFromRules(" + r + ") succeeded");
1644 }
1645
1646 // Test toRules again
1647 t->toRules(r, true);
1648 if (r != exp) {
1649 errln((UnicodeString)"FAIL: toRules() => " + r +
1650 ", expected " + exp);
1651 } else {
1652 logln((UnicodeString)"OK: toRules() => " + r);
1653 }
1654
1655 delete t;
1656
1657 // Test Foo(Bar) IDs. Careful with spacing in id; make it conform
1658 // to what the regenerated ID will look like.
1659 UnicodeString id("Upper(Lower);(NFKC)", "");
1660 t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
1661 if (t == 0) {
1662 errln("FAIL: createInstance #2 failed");
1663 return;
1664 }
1665 if (t->getID() == id) {
1666 logln((UnicodeString)"OK: created " + id);
1667 } else {
1668 errln((UnicodeString)"FAIL: createInstance(" + id +
1669 ").getID() => " + t->getID());
1670 }
1671
1672 Transliterator *u = t->createInverse(status);
1673 if (u == 0) {
1674 errln("FAIL: createInverse failed");
1675 delete t;
1676 return;
1677 }
1678 exp = "NFKC();Lower(Upper)";
1679 if (u->getID() == exp) {
1680 logln((UnicodeString)"OK: createInverse(" + id + ") => " +
1681 u->getID());
1682 } else {
1683 errln((UnicodeString)"FAIL: createInverse(" + id + ") => " +
1684 u->getID());
1685 }
1686 delete t;
1687 delete u;
1688 }
1689
1690 /**
1691 * Compound filter semantics were originally not implemented
1692 * correctly. Originally, each component filter f(i) is replaced by
1693 * f'(i) = f(i) && g, where g is the filter for the compound
1694 * transliterator.
1695 *
1696 * From Mark:
1697 *
1698 * Suppose and I have a transliterator X. Internally X is
1699 * "Greek-Latin; Latin-Cyrillic; Any-Lower". I use a filter [^A].
1700 *
1701 * The compound should convert all greek characters (through latin) to
1702 * cyrillic, then lowercase the result. The filter should say "don't
1703 * touch 'A' in the original". But because an intermediate result
1704 * happens to go through "A", the Greek Alpha gets hung up.
1705 */
TestCompoundFilter(void)1706 void TransliteratorTest::TestCompoundFilter(void) {
1707 UParseError parseError;
1708 UErrorCode status = U_ZERO_ERROR;
1709 Transliterator *t = Transliterator::createInstance
1710 ("Greek-Latin; Latin-Greek; Lower", UTRANS_FORWARD, parseError, status);
1711 if (t == 0) {
1712 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1713 return;
1714 }
1715 t->adoptFilter(new UnicodeSet("[^A]", status));
1716 if (U_FAILURE(status)) {
1717 errln("FAIL: UnicodeSet ct failed");
1718 delete t;
1719 return;
1720 }
1721
1722 // Only the 'A' at index 1 should remain unchanged
1723 expect(*t,
1724 CharsToUnicodeString("BA\\u039A\\u0391"),
1725 CharsToUnicodeString("\\u03b2A\\u03ba\\u03b1"));
1726 delete t;
1727 }
1728
TestRemove(void)1729 void TransliteratorTest::TestRemove(void) {
1730 UParseError parseError;
1731 UErrorCode status = U_ZERO_ERROR;
1732 Transliterator *t = Transliterator::createInstance("Remove[abc]", UTRANS_FORWARD, parseError, status);
1733 if (t == 0) {
1734 errln("FAIL: createInstance failed");
1735 return;
1736 }
1737
1738 expect(*t, "Able bodied baker's cats", "Ale odied ker's ts");
1739
1740 // extra test for RemoveTransliterator::clone(), which at one point wasn't
1741 // duplicating the filter
1742 Transliterator* t2 = t->clone();
1743 expect(*t2, "Able bodied baker's cats", "Ale odied ker's ts");
1744
1745 delete t;
1746 delete t2;
1747 }
1748
TestToRules(void)1749 void TransliteratorTest::TestToRules(void) {
1750 const char* RBT = "rbt";
1751 const char* SET = "set";
1752 static const char* DATA[] = {
1753 RBT,
1754 "$a=\\u4E61; [$a] > A;",
1755 "[\\u4E61] > A;",
1756
1757 RBT,
1758 "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
1759 "[[:Zs:][:Zl:]]{a} > A;",
1760
1761 SET,
1762 "[[:Zs:][:Zl:]]",
1763 "[[:Zs:][:Zl:]]",
1764
1765 SET,
1766 "[:Ps:]",
1767 "[:Ps:]",
1768
1769 SET,
1770 "[:L:]",
1771 "[:L:]",
1772
1773 SET,
1774 "[[:L:]-[A]]",
1775 "[[:L:]-[A]]",
1776
1777 SET,
1778 "[~[:Lu:][:Ll:]]",
1779 "[~[:Lu:][:Ll:]]",
1780
1781 SET,
1782 "[~[a-z]]",
1783 "[~[a-z]]",
1784
1785 RBT,
1786 "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
1787 "[^[:Zs:]]{a} > A;",
1788
1789 RBT,
1790 "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
1791 "[[a-z]-[:Zs:]]{a} > A;",
1792
1793 RBT,
1794 "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
1795 "[[:Zs:]&[a-z]]{a} > A;",
1796
1797 RBT,
1798 "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
1799 "[x[:Zs:]]{a} > A;",
1800
1801 RBT,
1802 "$accentMinus = [ [\\u0300-\\u0345] & [:M:] - [\\u0338]] ;"
1803 "$macron = \\u0304 ;"
1804 "$evowel = [aeiouyAEIOUY] ;"
1805 "$iotasub = \\u0345 ;"
1806 "($evowel $macron $accentMinus *) i > | $1 $iotasub ;",
1807 "([AEIOUYaeiouy]\\u0304[[\\u0300-\\u0345]&[:M:]-[\\u0338]]*)i > | $1 \\u0345;",
1808
1809 RBT,
1810 "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1811 "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1812 };
1813 static const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1814
1815 for (int32_t d=0; d < DATA_length; d+=3) {
1816 if (DATA[d] == RBT) {
1817 // Transliterator test
1818 UParseError parseError;
1819 UErrorCode status = U_ZERO_ERROR;
1820 Transliterator *t = Transliterator::createFromRules("ID",
1821 UnicodeString(DATA[d+1], -1, US_INV), UTRANS_FORWARD, parseError, status);
1822 if (t == 0) {
1823 dataerrln("FAIL: createFromRules failed - %s", u_errorName(status));
1824 return;
1825 }
1826 UnicodeString rules, escapedRules;
1827 t->toRules(rules, false);
1828 t->toRules(escapedRules, true);
1829 UnicodeString expRules = CharsToUnicodeString(DATA[d+2]);
1830 UnicodeString expEscapedRules(DATA[d+2], -1, US_INV);
1831 if (rules == expRules) {
1832 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1833 " => " + rules);
1834 } else {
1835 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1836 " => " + rules + ", exp " + expRules);
1837 }
1838 if (escapedRules == expEscapedRules) {
1839 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1840 " => " + escapedRules);
1841 } else {
1842 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1843 " => " + escapedRules + ", exp " + expEscapedRules);
1844 }
1845 delete t;
1846
1847 } else {
1848 // UnicodeSet test
1849 UErrorCode status = U_ZERO_ERROR;
1850 UnicodeString pat(DATA[d+1], -1, US_INV);
1851 UnicodeString expToPat(DATA[d+2], -1, US_INV);
1852 UnicodeSet set(pat, status);
1853 if (U_FAILURE(status)) {
1854 errln("FAIL: UnicodeSet ct failed");
1855 return;
1856 }
1857 // Adjust spacing etc. as necessary.
1858 UnicodeString toPat;
1859 set.toPattern(toPat);
1860 if (expToPat == toPat) {
1861 logln((UnicodeString)"Ok: " + pat +
1862 " => " + toPat);
1863 } else {
1864 errln((UnicodeString)"FAIL: " + pat +
1865 " => " + prettify(toPat, true) +
1866 ", exp " + prettify(pat, true));
1867 }
1868 }
1869 }
1870 }
1871
TestContext()1872 void TransliteratorTest::TestContext() {
1873 UTransPosition pos = {0, 2, 0, 1}; // cs cl s l
1874 expect("de > x; {d}e > y;",
1875 "de",
1876 "ye",
1877 &pos);
1878
1879 expect("ab{c} > z;",
1880 "xadabdabcy",
1881 "xadabdabzy");
1882 }
1883
TestSupplemental()1884 void TransliteratorTest::TestSupplemental() {
1885
1886 expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];"
1887 "a > $a; $s > i;"),
1888 CharsToUnicodeString("ab\\U0001030Fx"),
1889 CharsToUnicodeString("\\U00010300bix"));
1890
1891 expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];"
1892 "$b=[A-Z\\U00010400-\\U0001044D];"
1893 "($a)($b) > $2 $1;"),
1894 CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
1895 CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
1896
1897 // k|ax\\U00010300xm
1898
1899 // k|a\\U00010400\\U00010300xm
1900 // ky|\\U00010400\\U00010300xm
1901 // ky\\U00010400|\\U00010300xm
1902
1903 // ky\\U00010400|\\U00010300\\U00010400m
1904 // ky\\U00010400y|\\U00010400m
1905 expect(CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];"
1906 "$a {x} > | @ \\U00010400;"
1907 "{$a} [^\\u0000-\\uFFFF] > y;"),
1908 CharsToUnicodeString("kax\\U00010300xm"),
1909 CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
1910
1911 expectT("Any-Name",
1912 CharsToUnicodeString("\\U00010330\\U000E0061\\u00A0"),
1913 UNICODE_STRING_SIMPLE("\\N{GOTHIC LETTER AHSA}\\N{TAG LATIN SMALL LETTER A}\\N{NO-BREAK SPACE}"));
1914
1915 expectT("Any-Hex/Unicode",
1916 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1917 UNICODE_STRING_SIMPLE("U+10330U+10FF00U+E0061U+00A0"));
1918
1919 expectT("Any-Hex/C",
1920 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1921 UNICODE_STRING_SIMPLE("\\U00010330\\U0010FF00\\U000E0061\\u00A0"));
1922
1923 expectT("Any-Hex/Perl",
1924 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1925 UNICODE_STRING_SIMPLE("\\x{10330}\\x{10FF00}\\x{E0061}\\x{A0}"));
1926
1927 expectT("Any-Hex/Java",
1928 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1929 UNICODE_STRING_SIMPLE("\\uD800\\uDF30\\uDBFF\\uDF00\\uDB40\\uDC61\\u00A0"));
1930
1931 expectT("Any-Hex/XML",
1932 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1933 "𐌰􏼀󠁡 ");
1934
1935 expectT("Any-Hex/XML10",
1936 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1937 "𐌰􏼀󠁡 ");
1938
1939 expectT(UNICODE_STRING_SIMPLE("[\\U000E0000-\\U000E0FFF] Remove"),
1940 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1941 CharsToUnicodeString("\\U00010330\\U0010FF00\\u00A0"));
1942 }
1943
TestQuantifier()1944 void TransliteratorTest::TestQuantifier() {
1945
1946 // Make sure @ in a quantified anteContext works
1947 expect("a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
1948 "AAAAAb",
1949 "aaa(aac)");
1950
1951 // Make sure @ in a quantified postContext works
1952 expect("{b} a+ > c @@ |; (a+) > '(' $1 ')';",
1953 "baaaaa",
1954 "caa(aaa)");
1955
1956 // Make sure @ in a quantified postContext with seg ref works
1957 expect("{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
1958 "baaaaa",
1959 "baa(aaa)");
1960
1961 // Make sure @ past ante context doesn't enter ante context
1962 UTransPosition pos = {0, 5, 3, 5};
1963 expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
1964 "xxxab",
1965 "xxx(ac)",
1966 &pos);
1967
1968 // Make sure @ past post context doesn't pass limit
1969 UTransPosition pos2 = {0, 4, 0, 2};
1970 expect("{b} a+ > c @@ |; x > y; a > A;",
1971 "baxx",
1972 "caxx",
1973 &pos2);
1974
1975 // Make sure @ past post context doesn't enter post context
1976 expect("{b} a+ > c @@ |; x > y; a > A;",
1977 "baxx",
1978 "cayy");
1979
1980 expect("(ab)? c > d;",
1981 "c abc ababc",
1982 "d d abd");
1983
1984 // NOTE: The (ab)+ when referenced just yields a single "ab",
1985 // not the full sequence of them. This accords with perl behavior.
1986 expect("(ab)+ {x} > '(' $1 ')';",
1987 "x abx ababxy",
1988 "x ab(ab) abab(ab)y");
1989
1990 expect("b+ > x;",
1991 "ac abc abbc abbbc",
1992 "ac axc axc axc");
1993
1994 expect("[abc]+ > x;",
1995 "qac abrc abbcs abtbbc",
1996 "qx xrx xs xtx");
1997
1998 expect("q{(ab)+} > x;",
1999 "qa qab qaba qababc qaba",
2000 "qa qx qxa qxc qxa");
2001
2002 expect("q(ab)* > x;",
2003 "qa qab qaba qababc",
2004 "xa x xa xc");
2005
2006 // NOTE: The (ab)+ when referenced just yields a single "ab",
2007 // not the full sequence of them. This accords with perl behavior.
2008 expect("q(ab)* > '(' $1 ')';",
2009 "qa qab qaba qababc",
2010 "()a (ab) (ab)a (ab)c");
2011
2012 // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
2013 // quoted string
2014 expect("'ab'+ > x;",
2015 "bb ab ababb",
2016 "bb x xb");
2017
2018 // $foo+ and $foo* -- the quantifier should apply to the entire
2019 // variable reference
2020 expect("$var = ab; $var+ > x;",
2021 "bb ab ababb",
2022 "bb x xb");
2023 }
2024
2025 class TestTrans : public Transliterator {
2026 public:
TestTrans(const UnicodeString & id)2027 TestTrans(const UnicodeString& id) : Transliterator(id, 0) {
2028 }
clone(void) const2029 virtual TestTrans* clone(void) const override {
2030 return new TestTrans(getID());
2031 }
handleTransliterate(Replaceable &,UTransPosition & offsets,UBool) const2032 virtual void handleTransliterate(Replaceable& /*text*/, UTransPosition& offsets,
2033 UBool /*isIncremental*/) const override
2034 {
2035 offsets.start = offsets.limit;
2036 }
2037 virtual UClassID getDynamicClassID() const override;
2038 static UClassID U_EXPORT2 getStaticClassID();
2039 };
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)2040 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)
2041
2042 /**
2043 * Test Source-Target/Variant.
2044 */
2045 void TransliteratorTest::TestSTV(void) {
2046 int32_t ns = Transliterator::countAvailableSources();
2047 if (ns < 0 || ns > 255) {
2048 errln((UnicodeString)"FAIL: Bad source count: " + ns);
2049 return;
2050 }
2051 int32_t i, j;
2052 for (i=0; i<ns; ++i) {
2053 UnicodeString source;
2054 Transliterator::getAvailableSource(i, source);
2055 logln((UnicodeString)"" + i + ": " + source);
2056 if (source.length() == 0) {
2057 errln("FAIL: empty source");
2058 continue;
2059 }
2060 int32_t nt = Transliterator::countAvailableTargets(source);
2061 if (nt < 0 || nt > 255) {
2062 errln((UnicodeString)"FAIL: Bad target count: " + nt);
2063 continue;
2064 }
2065 for (int32_t j=0; j<nt; ++j) {
2066 UnicodeString target;
2067 Transliterator::getAvailableTarget(j, source, target);
2068 logln((UnicodeString)" " + j + ": " + target);
2069 if (target.length() == 0) {
2070 errln("FAIL: empty target");
2071 continue;
2072 }
2073 int32_t nv = Transliterator::countAvailableVariants(source, target);
2074 if (nv < 0 || nv > 255) {
2075 errln((UnicodeString)"FAIL: Bad variant count: " + nv);
2076 continue;
2077 }
2078 for (int32_t k=0; k<nv; ++k) {
2079 UnicodeString variant;
2080 Transliterator::getAvailableVariant(k, source, target, variant);
2081 if (variant.length() == 0) {
2082 logln((UnicodeString)" " + k + ": <empty>");
2083 } else {
2084 logln((UnicodeString)" " + k + ": " + variant);
2085 }
2086 }
2087 }
2088 }
2089
2090 // Test registration
2091 const char* IDS[] = { "Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2092 const char* FULL_IDS[] = { "Any-Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2093 const char* SOURCES[] = { NULL, "Seoridf", "Oewoir" };
2094 for (i=0; i<3; ++i) {
2095 Transliterator *t = new TestTrans(IDS[i]);
2096 if (t == 0) {
2097 errln("FAIL: out of memory");
2098 return;
2099 }
2100 if (t->getID() != IDS[i]) {
2101 errln((UnicodeString)"FAIL: ID mismatch for " + IDS[i]);
2102 delete t;
2103 return;
2104 }
2105 Transliterator::registerInstance(t);
2106 UErrorCode status = U_ZERO_ERROR;
2107 t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2108 if (t == NULL) {
2109 errln((UnicodeString)"FAIL: Registration/creation failed for ID " +
2110 IDS[i]);
2111 } else {
2112 logln((UnicodeString)"Ok: Registration/creation succeeded for ID " +
2113 IDS[i]);
2114 delete t;
2115 }
2116 Transliterator::unregister(IDS[i]);
2117 t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2118 if (t != NULL) {
2119 errln((UnicodeString)"FAIL: Unregistration failed for ID " +
2120 IDS[i]);
2121 delete t;
2122 }
2123 }
2124
2125 // Make sure getAvailable API reflects removal
2126 int32_t n = Transliterator::countAvailableIDs();
2127 for (i=0; i<n; ++i) {
2128 UnicodeString id = Transliterator::getAvailableID(i);
2129 for (j=0; j<3; ++j) {
2130 if (id.caseCompare(FULL_IDS[j],0)==0) {
2131 errln((UnicodeString)"FAIL: unregister(" + id + ") failed");
2132 }
2133 }
2134 }
2135 n = Transliterator::countAvailableTargets("Any");
2136 for (i=0; i<n; ++i) {
2137 UnicodeString t;
2138 Transliterator::getAvailableTarget(i, "Any", t);
2139 if (t.caseCompare(IDS[0],0)==0) {
2140 errln((UnicodeString)"FAIL: unregister(Any-" + t + ") failed");
2141 }
2142 }
2143 n = Transliterator::countAvailableSources();
2144 for (i=0; i<n; ++i) {
2145 UnicodeString s;
2146 Transliterator::getAvailableSource(i, s);
2147 for (j=0; j<3; ++j) {
2148 if (SOURCES[j] == NULL) continue;
2149 if (s.caseCompare(SOURCES[j],0)==0) {
2150 errln((UnicodeString)"FAIL: unregister(" + s + "-*) failed");
2151 }
2152 }
2153 }
2154 }
2155
2156 /**
2157 * Test inverse of Greek-Latin; Title()
2158 */
TestCompoundInverse(void)2159 void TransliteratorTest::TestCompoundInverse(void) {
2160 UParseError parseError;
2161 UErrorCode status = U_ZERO_ERROR;
2162 Transliterator *t = Transliterator::createInstance
2163 ("Greek-Latin; Title()", UTRANS_REVERSE,parseError, status);
2164 if (t == 0) {
2165 dataerrln("FAIL: createInstance - %s", u_errorName(status));
2166 return;
2167 }
2168 UnicodeString exp("(Title);Latin-Greek");
2169 if (t->getID() == exp) {
2170 logln("Ok: inverse of \"Greek-Latin; Title()\" is \"" +
2171 t->getID());
2172 } else {
2173 errln("FAIL: inverse of \"Greek-Latin; Title()\" is \"" +
2174 t->getID() + "\", expected \"" + exp + "\"");
2175 }
2176 delete t;
2177 }
2178
2179 /**
2180 * Test NFD chaining with RBT
2181 */
TestNFDChainRBT()2182 void TransliteratorTest::TestNFDChainRBT() {
2183 UParseError pe;
2184 UErrorCode ec = U_ZERO_ERROR;
2185 Transliterator* t = Transliterator::createFromRules(
2186 "TEST", "::NFD; aa > Q; a > q;",
2187 UTRANS_FORWARD, pe, ec);
2188 if (t == NULL || U_FAILURE(ec)) {
2189 dataerrln("FAIL: Transliterator::createFromRules failed with %s", u_errorName(ec));
2190 return;
2191 }
2192 expect(*t, "aa", "Q");
2193 delete t;
2194
2195 // TEMPORARY TESTS -- BEING DEBUGGED
2196 //=- UnicodeString s, s2;
2197 //=- t = Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, pe, ec);
2198 //=- s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2199 //=- s2 = CharsToUnicodeString("\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D");
2200 //=- expect(*t, s, s2);
2201 //=- delete t;
2202 //=-
2203 //=- t = Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2204 //=- expect(*t, s2, s);
2205 //=- delete t;
2206 //=-
2207 //=- t = Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2208 //=- s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2209 //=- expect(*t, s, s);
2210 //=- delete t;
2211
2212 // const char* source[] = {
2213 // /*
2214 // "\\u015Br\\u012Bmad",
2215 // "bhagavadg\\u012Bt\\u0101",
2216 // "adhy\\u0101ya",
2217 // "arjuna",
2218 // "vi\\u1E63\\u0101da",
2219 // "y\\u014Dga",
2220 // "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2221 // "uv\\u0101cr\\u0325",
2222 // */
2223 // "rmk\\u1E63\\u0113t",
2224 // //"dharmak\\u1E63\\u0113tr\\u0113",
2225 // /*
2226 // "kuruk\\u1E63\\u0113tr\\u0113",
2227 // "samav\\u0113t\\u0101",
2228 // "yuyutsava-\\u1E25",
2229 // "m\\u0101mak\\u0101-\\u1E25",
2230 // // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2231 // "kimakurvata",
2232 // "san\\u0304java",
2233 // */
2234 //
2235 // 0
2236 // };
2237 // const char* expected[] = {
2238 // /*
2239 // "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2240 // "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2241 // "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2242 // "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2243 // "\\u0935\\u093f\\u0937\\u093e\\u0926",
2244 // "\\u092f\\u094b\\u0917",
2245 // "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2246 // "\\u0909\\u0935\\u093E\\u091A\\u0943",
2247 // */
2248 // "\\u0927",
2249 // //"\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2250 // /*
2251 // "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2252 // "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2253 // "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2254 // "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2255 // // "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2256 // "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2257 // "\\u0938\\u0902\\u091c\\u0935",
2258 // */
2259 // 0
2260 // };
2261 // UErrorCode status = U_ZERO_ERROR;
2262 // UParseError parseError;
2263 // UnicodeString message;
2264 // Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2265 // Transliterator* devToLatinToDev=Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2266 // if(U_FAILURE(status)){
2267 // errln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2268 // errln("PreContext: " + prettify(parseError.preContext) + "PostContext: " + prettify( parseError.postContext) );
2269 // delete latinToDevToLatin;
2270 // delete devToLatinToDev;
2271 // return;
2272 // }
2273 // UnicodeString gotResult;
2274 // for(int i= 0; source[i] != 0; i++){
2275 // gotResult = source[i];
2276 // expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2277 // expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2278 // }
2279 // delete latinToDevToLatin;
2280 // delete devToLatinToDev;
2281 }
2282
2283 /**
2284 * Inverse of "Null" should be "Null". (J21)
2285 */
TestNullInverse()2286 void TransliteratorTest::TestNullInverse() {
2287 UParseError pe;
2288 UErrorCode ec = U_ZERO_ERROR;
2289 Transliterator *t = Transliterator::createInstance("Null", UTRANS_FORWARD, pe, ec);
2290 if (t == 0 || U_FAILURE(ec)) {
2291 errln("FAIL: createInstance");
2292 return;
2293 }
2294 Transliterator *u = t->createInverse(ec);
2295 if (u == 0 || U_FAILURE(ec)) {
2296 errln("FAIL: createInverse");
2297 delete t;
2298 return;
2299 }
2300 if (u->getID() != "Null") {
2301 errln("FAIL: Inverse of Null should be Null");
2302 }
2303 delete t;
2304 delete u;
2305 }
2306
2307 /**
2308 * Check ID of inverse of alias. (J22)
2309 */
TestAliasInverseID()2310 void TransliteratorTest::TestAliasInverseID() {
2311 UnicodeString ID("Latin-Hangul", ""); // This should be any alias ID with an inverse
2312 UParseError pe;
2313 UErrorCode ec = U_ZERO_ERROR;
2314 Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2315 if (t == 0 || U_FAILURE(ec)) {
2316 dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2317 return;
2318 }
2319 Transliterator *u = t->createInverse(ec);
2320 if (u == 0 || U_FAILURE(ec)) {
2321 errln("FAIL: createInverse");
2322 delete t;
2323 return;
2324 }
2325 UnicodeString exp = "Hangul-Latin";
2326 UnicodeString got = u->getID();
2327 if (got != exp) {
2328 errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2329 ", expected " + exp);
2330 }
2331 delete t;
2332 delete u;
2333 }
2334
2335 /**
2336 * Test IDs of inverses of compound transliterators. (J20)
2337 */
TestCompoundInverseID()2338 void TransliteratorTest::TestCompoundInverseID() {
2339 UnicodeString ID = "Latin-Jamo;NFC(NFD)";
2340 UParseError pe;
2341 UErrorCode ec = U_ZERO_ERROR;
2342 Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2343 if (t == 0 || U_FAILURE(ec)) {
2344 dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2345 return;
2346 }
2347 Transliterator *u = t->createInverse(ec);
2348 if (u == 0 || U_FAILURE(ec)) {
2349 errln("FAIL: createInverse");
2350 delete t;
2351 return;
2352 }
2353 UnicodeString exp = "NFD(NFC);Jamo-Latin";
2354 UnicodeString got = u->getID();
2355 if (got != exp) {
2356 errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2357 ", expected " + exp);
2358 }
2359 delete t;
2360 delete u;
2361 }
2362
2363 /**
2364 * Test undefined variable.
2365
2366 */
TestUndefinedVariable()2367 void TransliteratorTest::TestUndefinedVariable() {
2368 UnicodeString rule = "$initial } a <> \\u1161;";
2369 UParseError pe;
2370 UErrorCode ec = U_ZERO_ERROR;
2371 Transliterator *t = Transliterator::createFromRules("<ID>", rule, UTRANS_FORWARD, pe, ec);
2372 delete t;
2373 if (U_FAILURE(ec)) {
2374 logln((UnicodeString)"OK: Got exception for " + rule + ", as expected: " +
2375 u_errorName(ec));
2376 return;
2377 }
2378 errln((UnicodeString)"Fail: bogus rule " + rule + " compiled with error " +
2379 u_errorName(ec));
2380 }
2381
2382 /**
2383 * Test empty context.
2384 */
TestEmptyContext()2385 void TransliteratorTest::TestEmptyContext() {
2386 expect(" { a } > b;", "xay a ", "xby b ");
2387 }
2388
2389 /**
2390 * Test compound filter ID syntax
2391 */
TestCompoundFilterID(void)2392 void TransliteratorTest::TestCompoundFilterID(void) {
2393 static const char* DATA[] = {
2394 // Col. 1 = ID or rule set (latter must start with #)
2395
2396 // = columns > 1 are null if expect col. 1 to be illegal =
2397
2398 // Col. 2 = direction, "F..." or "R..."
2399 // Col. 3 = source string
2400 // Col. 4 = exp result
2401
2402 "[abc]; [abc]", NULL, NULL, NULL, // multiple filters
2403 "Latin-Greek; [abc];", NULL, NULL, NULL, // misplaced filter
2404 "[b]; Latin-Greek; Upper; ([xyz])", "F", "abc", "a\\u0392c",
2405 "[b]; (Lower); Latin-Greek; Upper(); ([\\u0392])", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2406 "#\n::[b]; ::Latin-Greek; ::Upper; ::([xyz]);", "F", "abc", "a\\u0392c",
2407 "#\n::[b]; ::(Lower); ::Latin-Greek; ::Upper(); ::([\\u0392]);", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2408 NULL,
2409 };
2410
2411 for (int32_t i=0; DATA[i]; i+=4) {
2412 UnicodeString id = CharsToUnicodeString(DATA[i]);
2413 UTransDirection direction = (DATA[i+1] != NULL && DATA[i+1][0] == 'R') ?
2414 UTRANS_REVERSE : UTRANS_FORWARD;
2415 UnicodeString source;
2416 UnicodeString exp;
2417 if (DATA[i+2] != NULL) {
2418 source = CharsToUnicodeString(DATA[i+2]);
2419 exp = CharsToUnicodeString(DATA[i+3]);
2420 }
2421 UBool expOk = (DATA[i+1] != NULL);
2422 LocalPointer<Transliterator> t;
2423 UParseError pe;
2424 UErrorCode ec = U_ZERO_ERROR;
2425 if (id.charAt(0) == 0x23/*#*/) {
2426 t.adoptInstead(Transliterator::createFromRules("ID", id, direction, pe, ec));
2427 } else {
2428 t.adoptInstead(Transliterator::createInstance(id, direction, pe, ec));
2429 }
2430 UBool ok = (t.isValid() && U_SUCCESS(ec));
2431 UnicodeString transID;
2432 if (t.isValid()) {
2433 transID = t->getID();
2434 }
2435 else {
2436 transID = UnicodeString("NULL", "");
2437 }
2438 if (ok == expOk) {
2439 logln((UnicodeString)"Ok: " + id + " => " + transID + ", " +
2440 u_errorName(ec));
2441 if (source.length() != 0) {
2442 expect(*t, source, exp);
2443 }
2444 } else {
2445 dataerrln((UnicodeString)"FAIL: " + id + " => " + transID + ", " +
2446 u_errorName(ec));
2447 }
2448 }
2449 }
2450
2451 /**
2452 * Test new property set syntax
2453 */
TestPropertySet()2454 void TransliteratorTest::TestPropertySet() {
2455 expect(UNICODE_STRING_SIMPLE("a>A; \\p{Lu}>x; \\p{ANY}>y;"), "abcDEF", "Ayyxxx");
2456 expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
2457 "[ a stitch ]\n[ in time ]\r[ saves 9]");
2458 }
2459
2460 /**
2461 * Test various failure points of the new 2.0 engine.
2462 */
void TransliteratorTest::TestNewEngine() {
    UParseError pe;
    UErrorCode ec = U_ZERO_ERROR;

    // Part 1: Latin-Hiragana run over mixed Latin / Hiragana / Katakana text.
    Transliterator *t = Transliterator::createInstance("Latin-Hiragana", UTRANS_FORWARD, pe, ec);
    if (t == 0 || U_FAILURE(ec)) {
        dataerrln("FAIL: createInstance Latin-Hiragana - %s", u_errorName(ec));
        return;
    }
    // Katakana should be untouched
    expect(*t, CharsToUnicodeString("a\\u3042\\u30A2"),
           CharsToUnicodeString("\\u3042\\u3042\\u30A2"));

    delete t;

#if 1
    // Part 2: a three-element compound (a_to_A, NFD, A_to_b) with a global
    // [:Ll:] filter.
    // This test will only work if Transliterator.ROLLBACK is
    // true. Otherwise, this test will fail, revealing a
    // limitation of global filters in incremental mode.
    Transliterator *a =
        Transliterator::createFromRules("a_to_A", "a > A;", UTRANS_FORWARD, pe, ec);
    Transliterator *A =
        Transliterator::createFromRules("A_to_b", "A > b;", UTRANS_FORWARD, pe, ec);
    // Note: this path returns without reporting an error.
    if (U_FAILURE(ec)) {
        delete a;
        delete A;
        return;
    }

    Transliterator* array[3];
    array[0] = a;
    array[1] = Transliterator::createInstance("NFD", UTRANS_FORWARD, pe, ec);
    array[2] = A;
    if (U_FAILURE(ec)) {
        errln("FAIL: createInstance NFD");
        delete a;
        delete A;
        delete array[1];
        return;
    }

    // NOTE(review): the array elements are deleted separately below, which
    // presumes CompoundTransliterator copies them rather than adopting them,
    // and that it takes ownership of the UnicodeSet filter -- confirm
    // against the CompoundTransliterator constructor.
    t = new CompoundTransliterator(array, 3, new UnicodeSet("[:Ll:]", ec));
    if (U_FAILURE(ec)) {
        errln("FAIL: UnicodeSet constructor");
        delete a;
        delete A;
        delete array[1];
        delete t;
        return;
    }

    // Only the lowercase letters are expected to change.
    expect(*t, "aAaA", "bAbA");

    // The compound must expose its three elements, in order, by ID.
    assertTrue("countElements", t->countElements() == 3);
    assertEquals("getElement(0)", t->getElement(0, ec).getID(), "a_to_A");
    assertEquals("getElement(1)", t->getElement(1, ec).getID(), "NFD");
    assertEquals("getElement(2)", t->getElement(2, ec).getID(), "A_to_b");
    assertSuccess("getElement", ec);

    delete a;
    delete A;
    delete array[1];
    delete t;
#endif

    // Part 3: a rule with variables, an optional element inside a segment,
    // set contexts and a cursor in the output.
    expect("$smooth = x; $macron = q; [:^L:] { ([aeiouyAEIOUY] $macron?) } [^aeiouyAEIOUY$smooth$macron] > | $1 $smooth ;",
           "a",
           "ax");

    // Part 4: Greek rough-breathing rules; alpha followed by U+0314 is
    // expected to come out as "ha".
    UnicodeString gr = CharsToUnicodeString(
        "$ddot = \\u0308 ;"
        "$lcgvowel = [\\u03b1\\u03b5\\u03b7\\u03b9\\u03bf\\u03c5\\u03c9] ;"
        "$rough = \\u0314 ;"
        "($lcgvowel+ $ddot?) $rough > h | $1 ;"
        "\\u03b1 <> a ;"
        "$rough <> h ;");

    expect(gr, CharsToUnicodeString("\\u03B1\\u0314"), "ha");
}
2541
2542 /**
2543 * Test quantified segment behavior. We want:
2544 * ([abc])+ > x $1 x; applied to "cba" produces "xax"
2545 */
TestQuantifiedSegment(void)2546 void TransliteratorTest::TestQuantifiedSegment(void) {
2547 // The normal case
2548 expect("([abc]+) > x $1 x;", "cba", "xcbax");
2549
2550 // The tricky case; the quantifier is around the segment
2551 expect("([abc])+ > x $1 x;", "cba", "xax");
2552
2553 // Tricky case in reverse direction
2554 expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
2555
2556 // Check post-context segment
2557 expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
2558
2559 // Test toRule/toPattern for non-quantified segment.
2560 // Careful with spacing here.
2561 UnicodeString r("([a-c]){q} > x $1 x;");
2562 UParseError pe;
2563 UErrorCode ec = U_ZERO_ERROR;
2564 Transliterator* t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2565 if (U_FAILURE(ec)) {
2566 errln("FAIL: createFromRules");
2567 delete t;
2568 return;
2569 }
2570 UnicodeString rr;
2571 t->toRules(rr, true);
2572 if (r != rr) {
2573 errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2574 } else {
2575 logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2576 }
2577 delete t;
2578
2579 // Test toRule/toPattern for quantified segment.
2580 // Careful with spacing here.
2581 r = "([a-c])+{q} > x $1 x;";
2582 t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2583 if (U_FAILURE(ec)) {
2584 errln("FAIL: createFromRules");
2585 delete t;
2586 return;
2587 }
2588 t->toRules(rr, true);
2589 if (r != rr) {
2590 errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2591 } else {
2592 logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2593 }
2594 delete t;
2595 }
2596
2597 //======================================================================
2598 // Ram's tests
2599 //======================================================================
TestDevanagariLatinRT()2600 void TransliteratorTest::TestDevanagariLatinRT(){
2601 const int MAX_LEN= 52;
2602 const char* const source[MAX_LEN] = {
2603 "bh\\u0101rata",
2604 "kra",
2605 "k\\u1E63a",
2606 "khra",
2607 "gra",
2608 "\\u1E45ra",
2609 "cra",
2610 "chra",
2611 "j\\u00F1a",
2612 "jhra",
2613 "\\u00F1ra",
2614 "\\u1E6Dya",
2615 "\\u1E6Dhra",
2616 "\\u1E0Dya",
2617 //"r\\u0323ya", // \u095c is not valid in Devanagari
2618 "\\u1E0Dhya",
2619 "\\u1E5Bhra",
2620 "\\u1E47ra",
2621 "tta",
2622 "thra",
2623 "dda",
2624 "dhra",
2625 "nna",
2626 "pra",
2627 "phra",
2628 "bra",
2629 "bhra",
2630 "mra",
2631 "\\u1E49ra",
2632 //"l\\u0331ra",
2633 "yra",
2634 "\\u1E8Fra",
2635 //"l-",
2636 "vra",
2637 "\\u015Bra",
2638 "\\u1E63ra",
2639 "sra",
2640 "hma",
2641 "\\u1E6D\\u1E6Da",
2642 "\\u1E6D\\u1E6Dha",
2643 "\\u1E6Dh\\u1E6Dha",
2644 "\\u1E0D\\u1E0Da",
2645 "\\u1E0D\\u1E0Dha",
2646 "\\u1E6Dya",
2647 "\\u1E6Dhya",
2648 "\\u1E0Dya",
2649 "\\u1E0Dhya",
2650 // Not roundtrippable --
2651 // \\u0939\\u094d\\u094d\\u092E - hma
2652 // \\u0939\\u094d\\u092E - hma
2653 // CharsToUnicodeString("hma"),
2654 "hya",
2655 "\\u015Br\\u0325",
2656 "\\u015Bca",
2657 "\\u0115",
2658 "san\\u0304j\\u012Bb s\\u0113nagupta",
2659 "\\u0101nand vaddir\\u0101ju",
2660 "\\u0101",
2661 "a"
2662 };
2663 const char* const expected[MAX_LEN] = {
2664 "\\u092D\\u093E\\u0930\\u0924", /* bha\\u0304rata */
2665 "\\u0915\\u094D\\u0930", /* kra */
2666 "\\u0915\\u094D\\u0937", /* ks\\u0323a */
2667 "\\u0916\\u094D\\u0930", /* khra */
2668 "\\u0917\\u094D\\u0930", /* gra */
2669 "\\u0919\\u094D\\u0930", /* n\\u0307ra */
2670 "\\u091A\\u094D\\u0930", /* cra */
2671 "\\u091B\\u094D\\u0930", /* chra */
2672 "\\u091C\\u094D\\u091E", /* jn\\u0303a */
2673 "\\u091D\\u094D\\u0930", /* jhra */
2674 "\\u091E\\u094D\\u0930", /* n\\u0303ra */
2675 "\\u091F\\u094D\\u092F", /* t\\u0323ya */
2676 "\\u0920\\u094D\\u0930", /* t\\u0323hra */
2677 "\\u0921\\u094D\\u092F", /* d\\u0323ya */
2678 //"\\u095C\\u094D\\u092F", /* r\\u0323ya */ // \u095c is not valid in Devanagari
2679 "\\u0922\\u094D\\u092F", /* d\\u0323hya */
2680 "\\u0922\\u093C\\u094D\\u0930", /* r\\u0323hra */
2681 "\\u0923\\u094D\\u0930", /* n\\u0323ra */
2682 "\\u0924\\u094D\\u0924", /* tta */
2683 "\\u0925\\u094D\\u0930", /* thra */
2684 "\\u0926\\u094D\\u0926", /* dda */
2685 "\\u0927\\u094D\\u0930", /* dhra */
2686 "\\u0928\\u094D\\u0928", /* nna */
2687 "\\u092A\\u094D\\u0930", /* pra */
2688 "\\u092B\\u094D\\u0930", /* phra */
2689 "\\u092C\\u094D\\u0930", /* bra */
2690 "\\u092D\\u094D\\u0930", /* bhra */
2691 "\\u092E\\u094D\\u0930", /* mra */
2692 "\\u0929\\u094D\\u0930", /* n\\u0331ra */
2693 //"\\u0934\\u094D\\u0930", /* l\\u0331ra */
2694 "\\u092F\\u094D\\u0930", /* yra */
2695 "\\u092F\\u093C\\u094D\\u0930", /* y\\u0307ra */
2696 //"l-",
2697 "\\u0935\\u094D\\u0930", /* vra */
2698 "\\u0936\\u094D\\u0930", /* s\\u0301ra */
2699 "\\u0937\\u094D\\u0930", /* s\\u0323ra */
2700 "\\u0938\\u094D\\u0930", /* sra */
2701 "\\u0939\\u094d\\u092E", /* hma */
2702 "\\u091F\\u094D\\u091F", /* t\\u0323t\\u0323a */
2703 "\\u091F\\u094D\\u0920", /* t\\u0323t\\u0323ha */
2704 "\\u0920\\u094D\\u0920", /* t\\u0323ht\\u0323ha*/
2705 "\\u0921\\u094D\\u0921", /* d\\u0323d\\u0323a */
2706 "\\u0921\\u094D\\u0922", /* d\\u0323d\\u0323ha */
2707 "\\u091F\\u094D\\u092F", /* t\\u0323ya */
2708 "\\u0920\\u094D\\u092F", /* t\\u0323hya */
2709 "\\u0921\\u094D\\u092F", /* d\\u0323ya */
2710 "\\u0922\\u094D\\u092F", /* d\\u0323hya */
2711 // "hma", /* hma */
2712 "\\u0939\\u094D\\u092F", /* hya */
2713 "\\u0936\\u0943", /* s\\u0301r\\u0325a */
2714 "\\u0936\\u094D\\u091A", /* s\\u0301ca */
2715 "\\u090d", /* e\\u0306 */
2716 "\\u0938\\u0902\\u091C\\u0940\\u092C\\u094D \\u0938\\u0947\\u0928\\u0917\\u0941\\u092A\\u094D\\u0924",
2717 "\\u0906\\u0928\\u0902\\u0926\\u094D \\u0935\\u0926\\u094D\\u0926\\u093F\\u0930\\u093E\\u091C\\u0941",
2718 "\\u0906",
2719 "\\u0905",
2720 };
2721 UErrorCode status = U_ZERO_ERROR;
2722 UParseError parseError;
2723 UnicodeString message;
2724 Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2725 Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2726 if(U_FAILURE(status)){
2727 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2728 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2729 return;
2730 }
2731 UnicodeString gotResult;
2732 for(int i= 0; i<MAX_LEN; i++){
2733 gotResult = source[i];
2734 expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2735 expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2736 }
2737 delete latinToDev;
2738 delete devToLatin;
2739 }
2740
TestTeluguLatinRT()2741 void TransliteratorTest::TestTeluguLatinRT(){
2742 const int MAX_LEN=10;
2743 const char* const source[MAX_LEN] = {
2744 "raghur\\u0101m vi\\u015Bvan\\u0101dha", /* Raghuram Viswanadha */
2745 "\\u0101nand vaddir\\u0101ju", /* Anand Vaddiraju */
2746 "r\\u0101j\\u012Bv ka\\u015Barab\\u0101da", /* Rajeev Kasarabada */
2747 "san\\u0304j\\u012Bv ka\\u015Barab\\u0101da", /* sanjeev kasarabada */
2748 "san\\u0304j\\u012Bb sen'gupta", /* sanjib sengupata */
2749 "amar\\u0113ndra hanum\\u0101nula", /* Amarendra hanumanula */
2750 "ravi kum\\u0101r vi\\u015Bvan\\u0101dha", /* Ravi Kumar Viswanadha */
2751 "\\u0101ditya kandr\\u0113gula", /* Aditya Kandregula */
2752 "\\u015Br\\u012Bdhar ka\\u1E47\\u1E6Dama\\u015Be\\u1E6D\\u1E6Di",/* Shridhar Kantamsetty */
2753 "m\\u0101dhav de\\u015Be\\u1E6D\\u1E6Di" /* Madhav Desetty */
2754 };
2755
2756 const char* const expected[MAX_LEN] = {
2757 "\\u0c30\\u0c18\\u0c41\\u0c30\\u0c3e\\u0c2e\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2758 "\\u0c06\\u0c28\\u0c02\\u0c26\\u0c4d \\u0C35\\u0C26\\u0C4D\\u0C26\\u0C3F\\u0C30\\u0C3E\\u0C1C\\u0C41",
2759 "\\u0c30\\u0c3e\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2760 "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2761 "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c2c\\u0c4d \\u0c38\\u0c46\\u0c28\\u0c4d\\u0c17\\u0c41\\u0c2a\\u0c4d\\u0c24",
2762 "\\u0c05\\u0c2e\\u0c30\\u0c47\\u0c02\\u0c26\\u0c4d\\u0c30 \\u0c39\\u0c28\\u0c41\\u0c2e\\u0c3e\\u0c28\\u0c41\\u0c32",
2763 "\\u0c30\\u0c35\\u0c3f \\u0c15\\u0c41\\u0c2e\\u0c3e\\u0c30\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2764 "\\u0c06\\u0c26\\u0c3f\\u0c24\\u0c4d\\u0c2f \\u0C15\\u0C02\\u0C26\\u0C4D\\u0C30\\u0C47\\u0C17\\u0C41\\u0c32",
2765 "\\u0c36\\u0c4d\\u0c30\\u0c40\\u0C27\\u0C30\\u0C4D \\u0c15\\u0c02\\u0c1f\\u0c2e\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2766 "\\u0c2e\\u0c3e\\u0c27\\u0c35\\u0c4d \\u0c26\\u0c46\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2767 };
2768
2769 UErrorCode status = U_ZERO_ERROR;
2770 UParseError parseError;
2771 UnicodeString message;
2772 Transliterator* latinToDev=Transliterator::createInstance("Latin-Telugu", UTRANS_FORWARD, parseError, status);
2773 Transliterator* devToLatin=Transliterator::createInstance("Telugu-Latin", UTRANS_FORWARD, parseError, status);
2774 if(U_FAILURE(status)){
2775 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2776 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2777 return;
2778 }
2779 UnicodeString gotResult;
2780 for(int i= 0; i<MAX_LEN; i++){
2781 gotResult = source[i];
2782 expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2783 expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2784 }
2785 delete latinToDev;
2786 delete devToLatin;
2787 }
2788
TestSanskritLatinRT()2789 void TransliteratorTest::TestSanskritLatinRT(){
2790 const int MAX_LEN =16;
2791 const char* const source[MAX_LEN] = {
2792 "rmk\\u1E63\\u0113t",
2793 "\\u015Br\\u012Bmad",
2794 "bhagavadg\\u012Bt\\u0101",
2795 "adhy\\u0101ya",
2796 "arjuna",
2797 "vi\\u1E63\\u0101da",
2798 "y\\u014Dga",
2799 "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2800 "uv\\u0101cr\\u0325",
2801 "dharmak\\u1E63\\u0113tr\\u0113",
2802 "kuruk\\u1E63\\u0113tr\\u0113",
2803 "samav\\u0113t\\u0101",
2804 "yuyutsava\\u1E25",
2805 "m\\u0101mak\\u0101\\u1E25",
2806 // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2807 "kimakurvata",
2808 "san\\u0304java",
2809 };
2810 const char* const expected[MAX_LEN] = {
2811 "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2812 "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2813 "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2814 "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2815 "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2816 "\\u0935\\u093f\\u0937\\u093e\\u0926",
2817 "\\u092f\\u094b\\u0917",
2818 "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2819 "\\u0909\\u0935\\u093E\\u091A\\u0943",
2820 "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2821 "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2822 "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2823 "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2824 "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2825 //"\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2826 "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2827 "\\u0938\\u0902\\u091c\\u0935",
2828 };
2829 UErrorCode status = U_ZERO_ERROR;
2830 UParseError parseError;
2831 UnicodeString message;
2832 Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2833 Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2834 if(U_FAILURE(status)){
2835 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2836 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2837 return;
2838 }
2839 UnicodeString gotResult;
2840 for(int i= 0; i<MAX_LEN; i++){
2841 gotResult = source[i];
2842 expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2843 expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2844 }
2845 delete latinToDev;
2846 delete devToLatin;
2847 }
2848
2849
TestCompoundLatinRT()2850 void TransliteratorTest::TestCompoundLatinRT(){
2851 const char* const source[] = {
2852 "rmk\\u1E63\\u0113t",
2853 "\\u015Br\\u012Bmad",
2854 "bhagavadg\\u012Bt\\u0101",
2855 "adhy\\u0101ya",
2856 "arjuna",
2857 "vi\\u1E63\\u0101da",
2858 "y\\u014Dga",
2859 "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2860 "uv\\u0101cr\\u0325",
2861 "dharmak\\u1E63\\u0113tr\\u0113",
2862 "kuruk\\u1E63\\u0113tr\\u0113",
2863 "samav\\u0113t\\u0101",
2864 "yuyutsava\\u1E25",
2865 "m\\u0101mak\\u0101\\u1E25",
2866 // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2867 "kimakurvata",
2868 "san\\u0304java"
2869 };
2870 const int MAX_LEN = UPRV_LENGTHOF(source);
2871 const char* const expected[MAX_LEN] = {
2872 "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2873 "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2874 "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2875 "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2876 "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2877 "\\u0935\\u093f\\u0937\\u093e\\u0926",
2878 "\\u092f\\u094b\\u0917",
2879 "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2880 "\\u0909\\u0935\\u093E\\u091A\\u0943",
2881 "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2882 "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2883 "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2884 "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2885 "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2886 // "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2887 "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2888 "\\u0938\\u0902\\u091c\\u0935"
2889 };
2890 if(MAX_LEN != UPRV_LENGTHOF(expected)) {
2891 errln("error in TestCompoundLatinRT: source[] and expected[] have different lengths!");
2892 return;
2893 }
2894
2895 UErrorCode status = U_ZERO_ERROR;
2896 UParseError parseError;
2897 UnicodeString message;
2898 Transliterator* devToLatinToDev =Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2899 Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2900 Transliterator* devToTelToDev =Transliterator::createInstance("Devanagari-Telugu;Telugu-Devanagari", UTRANS_FORWARD, parseError, status);
2901 Transliterator* latinToTelToLatin=Transliterator::createInstance("Latin-Telugu;Telugu-Latin", UTRANS_FORWARD, parseError, status);
2902
2903 if(U_FAILURE(status)){
2904 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2905 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2906 return;
2907 }
2908 UnicodeString gotResult;
2909 for(int i= 0; i<MAX_LEN; i++){
2910 gotResult = source[i];
2911 expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2912 expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2913 expect(*latinToTelToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2914
2915 }
2916 delete(latinToDevToLatin);
2917 delete(devToLatinToDev);
2918 delete(devToTelToDev);
2919 delete(latinToTelToLatin);
2920 }
2921
2922 /**
2923 * Test Gurmukhi-Devanagari Tippi and Bindi
2924 */
TestGurmukhiDevanagari()2925 void TransliteratorTest::TestGurmukhiDevanagari(){
2926 // the rule says:
2927 // (\u0902) (when preceded by vowel) ---> (\u0A02)
2928 // (\u0902) (when preceded by consonant) ---> (\u0A70)
2929 UErrorCode status = U_ZERO_ERROR;
2930 UnicodeSet vowel(UnicodeString("[\\u0905-\\u090A \\u090F\\u0910\\u0913\\u0914 \\u093e-\\u0942\\u0947\\u0948\\u094B\\u094C\\u094D]", -1, US_INV).unescape(), status);
2931 UnicodeSet non_vowel(UnicodeString("[\\u0915-\\u0928\\u092A-\\u0930]", -1, US_INV).unescape(), status);
2932 UParseError parseError;
2933
2934 UnicodeSetIterator vIter(vowel);
2935 UnicodeSetIterator nvIter(non_vowel);
2936 Transliterator* trans = Transliterator::createInstance("Devanagari-Gurmukhi",UTRANS_FORWARD, parseError, status);
2937 if(U_FAILURE(status)) {
2938 dataerrln("Error creating transliterator %s", u_errorName(status));
2939 delete trans;
2940 return;
2941 }
2942 UnicodeString src (" \\u0902", -1, US_INV);
2943 UnicodeString expected(" \\u0A02", -1, US_INV);
2944 src = src.unescape();
2945 expected= expected.unescape();
2946
2947 while(vIter.next()){
2948 src.setCharAt(0,(UChar) vIter.getCodepoint());
2949 expected.setCharAt(0,(UChar) (vIter.getCodepoint()+0x0100));
2950 expect(*trans,src,expected);
2951 }
2952
2953 expected.setCharAt(1,0x0A70);
2954 while(nvIter.next()){
2955 //src.setCharAt(0,(char) nvIter.codepoint);
2956 src.setCharAt(0,(UChar)nvIter.getCodepoint());
2957 expected.setCharAt(0,(UChar) (nvIter.getCodepoint()+0x0100));
2958 expect(*trans,src,expected);
2959 }
2960 delete trans;
2961 }
2962 /**
2963 * Test instantiation from a locale.
2964 */
TestLocaleInstantiation(void)2965 void TransliteratorTest::TestLocaleInstantiation(void) {
2966 UParseError pe;
2967 UErrorCode ec = U_ZERO_ERROR;
2968 Transliterator *t = Transliterator::createInstance("ru_RU-Latin", UTRANS_FORWARD, pe, ec);
2969 if (U_FAILURE(ec)) {
2970 dataerrln("FAIL: createInstance(ru_RU-Latin) - %s", u_errorName(ec));
2971 delete t;
2972 return;
2973 }
2974 expect(*t, CharsToUnicodeString("\\u0430"), "a");
2975 delete t;
2976
2977 t = Transliterator::createInstance("en-el", UTRANS_FORWARD, pe, ec);
2978 if (U_FAILURE(ec)) {
2979 errln("FAIL: createInstance(en-el)");
2980 delete t;
2981 return;
2982 }
2983 expect(*t, "a", CharsToUnicodeString("\\u03B1"));
2984 delete t;
2985 }
2986
2987 /**
2988 * Test title case handling of accent (should ignore accents)
2989 */
TestTitleAccents(void)2990 void TransliteratorTest::TestTitleAccents(void) {
2991 UParseError pe;
2992 UErrorCode ec = U_ZERO_ERROR;
2993 Transliterator *t = Transliterator::createInstance("Title", UTRANS_FORWARD, pe, ec);
2994 if (U_FAILURE(ec)) {
2995 errln("FAIL: createInstance(Title)");
2996 delete t;
2997 return;
2998 }
2999 expect(*t, CharsToUnicodeString("a\\u0300b can't abe"), CharsToUnicodeString("A\\u0300b Can't Abe"));
3000 delete t;
3001 }
3002
3003 /**
3004 * Basic test of a locale resource based rule.
3005 */
TestLocaleResource()3006 void TransliteratorTest::TestLocaleResource() {
3007 const char* DATA[] = {
3008 // id from to
3009 //"Latin-Greek/UNGEGN", "b", "\\u03bc\\u03c0",
3010 "Latin-el", "b", "\\u03bc\\u03c0",
3011 "Latin-Greek", "b", "\\u03B2",
3012 "Greek-Latin/UNGEGN", "\\u03B2", "v",
3013 "el-Latin", "\\u03B2", "v",
3014 "Greek-Latin", "\\u03B2", "b",
3015 };
3016 const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3017 for (int32_t i=0; i<DATA_length; i+=3) {
3018 UParseError pe;
3019 UErrorCode ec = U_ZERO_ERROR;
3020 Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, pe, ec);
3021 if (U_FAILURE(ec)) {
3022 dataerrln((UnicodeString)"FAIL: createInstance(" + DATA[i] + ") - " + u_errorName(ec));
3023 delete t;
3024 continue;
3025 }
3026 expect(*t, CharsToUnicodeString(DATA[i+1]),
3027 CharsToUnicodeString(DATA[i+2]));
3028 delete t;
3029 }
3030 }
3031
3032 /**
3033 * Make sure parse errors reference the right line.
3034 */
TestParseError()3035 void TransliteratorTest::TestParseError() {
3036 static const char* rule =
3037 "a > b;\n"
3038 "# more stuff\n"
3039 "d << b;";
3040 UErrorCode ec = U_ZERO_ERROR;
3041 UParseError pe;
3042 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3043 delete t;
3044 if (U_FAILURE(ec)) {
3045 UnicodeString err(pe.preContext);
3046 err.append((UChar)124/*|*/).append(pe.postContext);
3047 if (err.indexOf("d << b") >= 0) {
3048 logln("Ok: " + err);
3049 } else {
3050 errln("FAIL: " + err);
3051 }
3052 }
3053 else {
3054 errln("FAIL: no syntax error");
3055 }
3056 static const char* maskingRule =
3057 "a>x;\n"
3058 "# more stuff\n"
3059 "ab>y;";
3060 ec = U_ZERO_ERROR;
3061 delete Transliterator::createFromRules("ID", maskingRule, UTRANS_FORWARD, pe, ec);
3062 if (ec != U_RULE_MASK_ERROR) {
3063 errln("FAIL: returned %s instead of U_RULE_MASK_ERROR", u_errorName(ec));
3064 }
3065 else if (UnicodeString("a > x;") != UnicodeString(pe.preContext)) {
3066 errln("FAIL: did not get expected precontext");
3067 }
3068 else if (UnicodeString("ab > y;") != UnicodeString(pe.postContext)) {
3069 errln("FAIL: did not get expected postcontext");
3070 }
3071 }
3072
3073 /**
3074 * Make sure sets on output are disallowed.
3075 */
TestOutputSet()3076 void TransliteratorTest::TestOutputSet() {
3077 UnicodeString rule = "$set = [a-cm-n]; b > $set;";
3078 UErrorCode ec = U_ZERO_ERROR;
3079 UParseError pe;
3080 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3081 delete t;
3082 if (U_FAILURE(ec)) {
3083 UnicodeString err(pe.preContext);
3084 err.append((UChar)124/*|*/).append(pe.postContext);
3085 logln("Ok: " + err);
3086 return;
3087 }
3088 errln("FAIL: No syntax error");
3089 }
3090
3091 /**
3092 * Test the use variable range pragma, making sure that use of
3093 * variable range characters is detected and flagged as an error.
3094 */
TestVariableRange()3095 void TransliteratorTest::TestVariableRange() {
3096 UnicodeString rule = "use variable range 0x70 0x72; a > A; b > B; q > Q;";
3097 UErrorCode ec = U_ZERO_ERROR;
3098 UParseError pe;
3099 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3100 delete t;
3101 if (U_FAILURE(ec)) {
3102 UnicodeString err(pe.preContext);
3103 err.append((UChar)124/*|*/).append(pe.postContext);
3104 logln("Ok: " + err);
3105 return;
3106 }
3107 errln("FAIL: No syntax error");
3108 }
3109
3110 /**
3111 * Test invalid post context error handling
3112 */
TestInvalidPostContext()3113 void TransliteratorTest::TestInvalidPostContext() {
3114 UnicodeString rule = "a}b{c>d;";
3115 UErrorCode ec = U_ZERO_ERROR;
3116 UParseError pe;
3117 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3118 delete t;
3119 if (U_FAILURE(ec)) {
3120 UnicodeString err(pe.preContext);
3121 err.append((UChar)124/*|*/).append(pe.postContext);
3122 if (err.indexOf("a}b{c") >= 0) {
3123 logln("Ok: " + err);
3124 } else {
3125 errln("FAIL: " + err);
3126 }
3127 return;
3128 }
3129 errln("FAIL: No syntax error");
3130 }
3131
3132 /**
3133 * Test ID form variants
3134 */
TestIDForms()3135 void TransliteratorTest::TestIDForms() {
3136 const char* DATA[] = {
3137 "NFC", NULL, "NFD",
3138 "nfd", NULL, "NFC", // make sure case is ignored
3139 "Any-NFKD", NULL, "Any-NFKC",
3140 "Null", NULL, "Null",
3141 "-nfkc", "nfkc", "NFKD",
3142 "-nfkc/", "nfkc", "NFKD",
3143 "Latin-Greek/UNGEGN", NULL, "Greek-Latin/UNGEGN",
3144 "Greek/UNGEGN-Latin", "Greek-Latin/UNGEGN", "Latin-Greek/UNGEGN",
3145 "Bengali-Devanagari/", "Bengali-Devanagari", "Devanagari-Bengali",
3146 "Source-", NULL, NULL,
3147 "Source/Variant-", NULL, NULL,
3148 "Source-/Variant", NULL, NULL,
3149 "/Variant", NULL, NULL,
3150 "/Variant-", NULL, NULL,
3151 "-/Variant", NULL, NULL,
3152 "-/", NULL, NULL,
3153 "-", NULL, NULL,
3154 "/", NULL, NULL,
3155 };
3156 const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3157
3158 for (int32_t i=0; i<DATA_length; i+=3) {
3159 const char* ID = DATA[i];
3160 const char* expID = DATA[i+1];
3161 const char* expInvID = DATA[i+2];
3162 UBool expValid = (expInvID != NULL);
3163 if (expID == NULL) {
3164 expID = ID;
3165 }
3166 UParseError pe;
3167 UErrorCode ec = U_ZERO_ERROR;
3168 Transliterator *t =
3169 Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
3170 if (U_FAILURE(ec)) {
3171 if (!expValid) {
3172 logln((UnicodeString)"Ok: getInstance(" + ID +") => " + u_errorName(ec));
3173 } else {
3174 dataerrln((UnicodeString)"FAIL: Couldn't create " + ID + " - " + u_errorName(ec));
3175 }
3176 delete t;
3177 continue;
3178 }
3179 Transliterator *u = t->createInverse(ec);
3180 if (U_FAILURE(ec)) {
3181 errln((UnicodeString)"FAIL: Couldn't create inverse of " + ID);
3182 delete t;
3183 delete u;
3184 continue;
3185 }
3186 if (t->getID() == expID &&
3187 u->getID() == expInvID) {
3188 logln((UnicodeString)"Ok: " + ID + ".getInverse() => " + expInvID);
3189 } else {
3190 errln((UnicodeString)"FAIL: getInstance(" + ID + ") => " +
3191 t->getID() + " x getInverse() => " + u->getID() +
3192 ", expected " + expInvID);
3193 }
3194 delete t;
3195 delete u;
3196 }
3197 }
3198
// NUL-terminated UTF-16 strings used by checkRules() to strip whitespace:
// one-character strings for the characters to remove, and the empty
// replacement string.
static const UChar SPACE[]   = {0x0020, 0};  // ' '
static const UChar NEWLINE[] = {0x000A, 0};  // '\n'
static const UChar RETURN[]  = {0x000D, 0};  // '\r'
static const UChar EMPTY[]   = {0};
3203
checkRules(const UnicodeString & label,Transliterator & t2,const UnicodeString & testRulesForward)3204 void TransliteratorTest::checkRules(const UnicodeString& label, Transliterator& t2,
3205 const UnicodeString& testRulesForward) {
3206 UnicodeString rules2; t2.toRules(rules2, true);
3207 //rules2 = TestUtility.replaceAll(rules2, new UnicodeSet("[' '\n\r]"), "");
3208 rules2.findAndReplace(SPACE, EMPTY);
3209 rules2.findAndReplace(NEWLINE, EMPTY);
3210 rules2.findAndReplace(RETURN, EMPTY);
3211
3212 UnicodeString testRules(testRulesForward); testRules.findAndReplace(SPACE, EMPTY);
3213
3214 if (rules2 != testRules) {
3215 errln(label);
3216 logln((UnicodeString)"GENERATED RULES: " + rules2);
3217 logln((UnicodeString)"SHOULD BE: " + testRulesForward);
3218 }
3219 }
3220
3221 /**
3222 * Mark's toRules test.
3223 */
TestToRulesMark()3224 void TransliteratorTest::TestToRulesMark() {
3225 const char* testRules =
3226 "::[[:Latin:][:Mark:]];"
3227 "::NFKD (NFC);"
3228 "::Lower (Lower);"
3229 "a <> \\u03B1;" // alpha
3230 "::NFKC (NFD);"
3231 "::Upper (Lower);"
3232 "::Lower ();"
3233 "::([[:Greek:][:Mark:]]);"
3234 ;
3235 const char* testRulesForward =
3236 "::[[:Latin:][:Mark:]];"
3237 "::NFKD(NFC);"
3238 "::Lower(Lower);"
3239 "a > \\u03B1;"
3240 "::NFKC(NFD);"
3241 "::Upper (Lower);"
3242 "::Lower ();"
3243 ;
3244 const char* testRulesBackward =
3245 "::[[:Greek:][:Mark:]];"
3246 "::Lower (Upper);"
3247 "::NFD(NFKC);"
3248 "\\u03B1 > a;"
3249 "::Lower(Lower);"
3250 "::NFC(NFKD);"
3251 ;
3252 UnicodeString source = CharsToUnicodeString("\\u00E1"); // a-acute
3253 UnicodeString target = CharsToUnicodeString("\\u03AC"); // alpha-acute
3254
3255 UParseError pe;
3256 UErrorCode ec = U_ZERO_ERROR;
3257 LocalPointer<Transliterator> t2(
3258 Transliterator::createFromRules("source-target", UnicodeString(testRules, -1, US_INV), UTRANS_FORWARD, pe, ec));
3259 LocalPointer<Transliterator> t3(
3260 Transliterator::createFromRules("target-source", UnicodeString(testRules, -1, US_INV), UTRANS_REVERSE, pe, ec));
3261
3262 if (U_FAILURE(ec)) {
3263 dataerrln((UnicodeString)"FAIL: createFromRules => " + u_errorName(ec));
3264 return;
3265 }
3266
3267 expect(*t2, source, target);
3268 expect(*t3, target, source);
3269
3270 checkRules("Failed toRules FORWARD", *t2, UnicodeString(testRulesForward, -1, US_INV));
3271 checkRules("Failed toRules BACKWARD", *t3, UnicodeString(testRulesBackward, -1, US_INV));
3272 }
3273
3274 /**
3275 * Test Escape and Unescape transliterators.
3276 */
TestEscape()3277 void TransliteratorTest::TestEscape() {
3278 UParseError pe;
3279 UErrorCode ec;
3280 Transliterator *t;
3281
3282 ec = U_ZERO_ERROR;
3283 t = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, pe, ec);
3284 if (U_FAILURE(ec)) {
3285 errln((UnicodeString)"FAIL: createInstance");
3286 } else {
3287 expect(*t,
3288 UNICODE_STRING_SIMPLE("\\x{40}\\U000000312Q"),
3289 "@12Q");
3290 }
3291 delete t;
3292
3293 ec = U_ZERO_ERROR;
3294 t = Transliterator::createInstance("Any-Hex/C", UTRANS_FORWARD, pe, ec);
3295 if (U_FAILURE(ec)) {
3296 errln((UnicodeString)"FAIL: createInstance");
3297 } else {
3298 expect(*t,
3299 CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3300 UNICODE_STRING_SIMPLE("\\u0041\\U0010BEEF\\uFEED"));
3301 }
3302 delete t;
3303
3304 ec = U_ZERO_ERROR;
3305 t = Transliterator::createInstance("Any-Hex/Java", UTRANS_FORWARD, pe, ec);
3306 if (U_FAILURE(ec)) {
3307 errln((UnicodeString)"FAIL: createInstance");
3308 } else {
3309 expect(*t,
3310 CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3311 UNICODE_STRING_SIMPLE("\\u0041\\uDBEF\\uDEEF\\uFEED"));
3312 }
3313 delete t;
3314
3315 ec = U_ZERO_ERROR;
3316 t = Transliterator::createInstance("Any-Hex/Perl", UTRANS_FORWARD, pe, ec);
3317 if (U_FAILURE(ec)) {
3318 errln((UnicodeString)"FAIL: createInstance");
3319 } else {
3320 expect(*t,
3321 CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3322 UNICODE_STRING_SIMPLE("\\x{41}\\x{10BEEF}\\x{FEED}"));
3323 }
3324 delete t;
3325 }
3326
3327
TestAnchorMasking()3328 void TransliteratorTest::TestAnchorMasking(){
3329 UnicodeString rule ("^a > Q; a > q;");
3330 UErrorCode status= U_ZERO_ERROR;
3331 UParseError parseError;
3332
3333 Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
3334 if(U_FAILURE(status)){
3335 errln(UnicodeString("FAIL: ") + "ID" +
3336 ".createFromRules() => bad rules" +
3337 /*", parse error " + parseError.code +*/
3338 ", line " + parseError.line +
3339 ", offset " + parseError.offset +
3340 ", context " + prettify(parseError.preContext, true) +
3341 ", rules: " + prettify(rule, true));
3342 }
3343 delete t;
3344 }
3345
3346 /**
3347 * Make sure display names of variants look reasonable.
3348 */
TestDisplayName()3349 void TransliteratorTest::TestDisplayName() {
3350 #if UCONFIG_NO_FORMATTING
3351 logln("Skipping, UCONFIG_NO_FORMATTING is set\n");
3352 return;
3353 #else
3354 static const char* DATA[] = {
3355 // ID, forward name, reverse name
3356 // Update the text as necessary -- the important thing is
3357 // not the text itself, but how various cases are handled.
3358
3359 // Basic test
3360 "Any-Hex", "Any to Hex Escape", "Hex Escape to Any",
3361
3362 // Variants
3363 "Any-Hex/Perl", "Any to Hex Escape/Perl", "Hex Escape to Any/Perl",
3364
3365 // Target-only IDs
3366 "NFC", "Any to NFC", "Any to NFD",
3367 };
3368
3369 int32_t DATA_length = UPRV_LENGTHOF(DATA);
3370
3371 Locale US("en", "US");
3372
3373 for (int32_t i=0; i<DATA_length; i+=3) {
3374 UnicodeString name;
3375 Transliterator::getDisplayName(DATA[i], US, name);
3376 if (name != DATA[i+1]) {
3377 dataerrln((UnicodeString)"FAIL: " + DATA[i] + ".getDisplayName() => " +
3378 name + ", expected " + DATA[i+1]);
3379 } else {
3380 logln((UnicodeString)"Ok: " + DATA[i] + ".getDisplayName() => " + name);
3381 }
3382 UErrorCode ec = U_ZERO_ERROR;
3383 UParseError pe;
3384 Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_REVERSE, pe, ec);
3385 if (U_FAILURE(ec)) {
3386 delete t;
3387 dataerrln("FAIL: createInstance failed - %s", u_errorName(ec));
3388 continue;
3389 }
3390 name = Transliterator::getDisplayName(t->getID(), US, name);
3391 if (name != DATA[i+2]) {
3392 dataerrln((UnicodeString)"FAIL: " + t->getID() + ".getDisplayName() => " +
3393 name + ", expected " + DATA[i+2]);
3394 } else {
3395 logln((UnicodeString)"Ok: " + t->getID() + ".getDisplayName() => " + name);
3396 }
3397 delete t;
3398 }
3399 #endif
3400 }
3401
TestSpecialCases(void)3402 void TransliteratorTest::TestSpecialCases(void) {
3403 const UnicodeString registerRules[] = {
3404 "Any-Dev1", "x > X; y > Y;",
3405 "Any-Dev2", "XY > Z",
3406 "Greek-Latin/FAKE",
3407 CharsToUnicodeString
3408 ("[^[:L:][:M:]] { \\u03bc\\u03c0 > b ; \\u03bc\\u03c0 } [^[:L:][:M:]] > b ; [^[:L:][:M:]] { [\\u039c\\u03bc][\\u03a0\\u03c0] > B ; [\\u039c\\u03bc][\\u03a0\\u03c0] } [^[:L:][:M:]] > B ;"),
3409 "" // END MARKER
3410 };
3411
3412 const UnicodeString testCases[] = {
3413 // NORMALIZATION
3414 // should add more test cases
3415 "NFD" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3416 "NFC" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3417 "NFKD", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3418 "NFKC", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3419
3420 // mp -> b BUG
3421 "Greek-Latin/UNGEGN", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3422 "Greek-Latin/FAKE", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3423
3424 // check for devanagari bug
3425 "nfd;Dev1;Dev2;nfc", "xy", "Z",
3426
3427 // ff, i, dotless-i, I, dotted-I, LJLjlj deseret deeDEE
3428 "Title", CharsToUnicodeString("ab'cD ffi\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3429 CharsToUnicodeString("Ab'cd Ffi\\u0131ii\\u0307 \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3430
3431 //TODO: enable this test once Titlecase works right
3432 /*
3433 "Title", CharsToUnicodeString("\\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3434 CharsToUnicodeString("Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3435 */
3436 "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3437 CharsToUnicodeString("AB'CD FFIII\\u0130 \\u01C7\\u01C7\\u01C7 ") + DESERET_DEE + DESERET_DEE,
3438 "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3439 CharsToUnicodeString("ab'cd \\uFB00i\\u0131ii\\u0307 \\u01C9\\u01C9\\u01C9 ") + DESERET_dee + DESERET_dee,
3440
3441 "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3442 "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3443
3444 // FORMS OF S
3445 "Greek-Latin/UNGEGN", CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3446 CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3447 "Latin-Greek/UNGEGN", CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3448 CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3") ,
3449 "Greek-Latin", CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3450 CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3451 "Latin-Greek", CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3452 CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3453 // Tatiana bug
3454 // Upper: TAT\\u02B9\\u00C2NA
3455 // Lower: tat\\u02B9\\u00E2na
3456 // Title: Tat\\u02B9\\u00E2na
3457 "Upper", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3458 CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3459 "Lower", CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3460 CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3461 "Title", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3462 CharsToUnicodeString("Tat\\u02B9\\u00E2na"),
3463
3464 "" // END MARKER
3465 };
3466
3467 UParseError pos;
3468 int32_t i;
3469 for (i = 0; registerRules[i].length()!=0; i+=2) {
3470 UErrorCode status = U_ZERO_ERROR;
3471
3472 Transliterator *t = Transliterator::createFromRules(registerRules[0+i],
3473 registerRules[i+1], UTRANS_FORWARD, pos, status);
3474 if (U_FAILURE(status)) {
3475 dataerrln("Fails: Unable to create the transliterator from rules. - %s", u_errorName(status));
3476 } else {
3477 Transliterator::registerInstance(t);
3478 }
3479 }
3480 for (i = 0; testCases[i].length()!=0; i+=3) {
3481 UErrorCode ec = U_ZERO_ERROR;
3482 UParseError pe;
3483 const UnicodeString& name = testCases[i];
3484 Transliterator *t = Transliterator::createInstance(name, UTRANS_FORWARD, pe, ec);
3485 if (U_FAILURE(ec)) {
3486 dataerrln((UnicodeString)"FAIL: Couldn't create " + name + " - " + u_errorName(ec));
3487 delete t;
3488 continue;
3489 }
3490 const UnicodeString& id = t->getID();
3491 const UnicodeString& source = testCases[i+1];
3492 UnicodeString target;
3493
3494 // Automatic generation of targets, to make it simpler to add test cases (and more fail-safe)
3495
3496 if (testCases[i+2].length() > 0) {
3497 target = testCases[i+2];
3498 } else if (0==id.caseCompare("NFD", U_FOLD_CASE_DEFAULT)) {
3499 Normalizer::normalize(source, UNORM_NFD, 0, target, ec);
3500 } else if (0==id.caseCompare("NFC", U_FOLD_CASE_DEFAULT)) {
3501 Normalizer::normalize(source, UNORM_NFC, 0, target, ec);
3502 } else if (0==id.caseCompare("NFKD", U_FOLD_CASE_DEFAULT)) {
3503 Normalizer::normalize(source, UNORM_NFKD, 0, target, ec);
3504 } else if (0==id.caseCompare("NFKC", U_FOLD_CASE_DEFAULT)) {
3505 Normalizer::normalize(source, UNORM_NFKC, 0, target, ec);
3506 } else if (0==id.caseCompare("Lower", U_FOLD_CASE_DEFAULT)) {
3507 target = source;
3508 target.toLower(Locale::getUS());
3509 } else if (0==id.caseCompare("Upper", U_FOLD_CASE_DEFAULT)) {
3510 target = source;
3511 target.toUpper(Locale::getUS());
3512 }
3513 if (U_FAILURE(ec)) {
3514 errln((UnicodeString)"FAIL: Internal error normalizing " + source);
3515 continue;
3516 }
3517
3518 expect(*t, source, target);
3519 delete t;
3520 }
3521 for (i = 0; registerRules[i].length()!=0; i+=2) {
3522 Transliterator::unregister(registerRules[i]);
3523 }
3524 }
3525
/**
 * Write a code point into buffer as a backslash escape with lowercase hex
 * digits: "\u" plus at least 4 digits when ch <= 0xFFFF, otherwise "\U"
 * plus at least 8 digits.
 *
 * @param ch     the value to format
 * @param buffer destination; no size is passed, so the caller must make it
 *               large enough (11 bytes cover any value, assuming a 32-bit int)
 * @return buffer
 */
char* Char32ToEscapedChars(UChar32 ch, char* buffer) {
    const bool needsLongForm = !(ch <= 0xFFFF);
    if (needsLongForm) {
        sprintf(buffer, "\\U%08x", (int)ch);
        return buffer;
    }
    sprintf(buffer, "\\u%04x", (int)ch);
    return buffer;
}
3534
TestSurrogateCasing(void)3535 void TransliteratorTest::TestSurrogateCasing (void) {
3536 // check that casing handles surrogates
3537 // titlecase is currently defective
3538 char buffer[20];
3539 UChar buffer2[20];
3540 UChar32 dee;
3541 U16_GET(DESERET_dee,0, 0, DESERET_dee.length(), dee);
3542 UnicodeString DEE(u_totitle(dee));
3543 if (DEE != DESERET_DEE) {
3544 err("Fails titlecase of surrogates");
3545 err(Char32ToEscapedChars(dee, buffer));
3546 err(", ");
3547 errln(Char32ToEscapedChars(DEE.char32At(0), buffer));
3548 }
3549
3550 UnicodeString deeDEETest=DESERET_dee + DESERET_DEE;
3551 UnicodeString deedeeTest = DESERET_dee + DESERET_dee;
3552 UnicodeString DEEDEETest = DESERET_DEE + DESERET_DEE;
3553 UErrorCode status= U_ZERO_ERROR;
3554
3555 u_strToUpper(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
3556 if (U_FAILURE(status) || (UnicodeString(buffer2)!= DEEDEETest)) {
3557 errln("Fails: Can't uppercase surrogates.");
3558 }
3559
3560 status= U_ZERO_ERROR;
3561 u_strToLower(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
3562 if (U_FAILURE(status) || (UnicodeString(buffer2)!= deedeeTest)) {
3563 errln("Fails: Can't lowercase surrogates.");
3564 }
3565 }
3566
_trans(Transliterator & t,const UnicodeString & src,UnicodeString & result)3567 static void _trans(Transliterator& t, const UnicodeString& src,
3568 UnicodeString& result) {
3569 result = src;
3570 t.transliterate(result);
3571 }
3572
_trans(const UnicodeString & id,const UnicodeString & src,UnicodeString & result,UErrorCode ec)3573 static void _trans(const UnicodeString& id, const UnicodeString& src,
3574 UnicodeString& result, UErrorCode ec) {
3575 UParseError pe;
3576 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
3577 if (U_SUCCESS(ec)) {
3578 _trans(*t, src, result);
3579 }
3580 delete t;
3581 }
3582
/**
 * Look up source in a flat array of { key, value } string pairs.
 *
 * @param source the key to look for; compared with default case folding
 * @param pairs  key/value pairs, ended by an empty string in a key position
 * @return the value paired with the first matching key, or an empty string
 *         when no key matches
 */
static UnicodeString _findMatch(const UnicodeString& source,
                                const UnicodeString* pairs) {
    for (const UnicodeString* key = pairs; key->length() > 0; key += 2) {
        if (source.caseCompare(*key, U_FOLD_CASE_DEFAULT) == 0) {
            return key[1];
        }
    }
    return UnicodeString();
}
3593
3594 // Check to see that incremental gets at least part way through a reasonable string.
3595
TestIncrementalProgress(void)3596 void TransliteratorTest::TestIncrementalProgress(void) {
3597 UErrorCode ec = U_ZERO_ERROR;
3598 UnicodeString latinTest = "The Quick Brown Fox.";
3599 UnicodeString devaTest;
3600 _trans("Latin-Devanagari", latinTest, devaTest, ec);
3601 UnicodeString kataTest;
3602 _trans("Latin-Katakana", latinTest, kataTest, ec);
3603 if (U_FAILURE(ec)) {
3604 errln("FAIL: Internal error");
3605 return;
3606 }
3607 const UnicodeString tests[] = {
3608 "Any", latinTest,
3609 "Latin", latinTest,
3610 "Halfwidth", latinTest,
3611 "Devanagari", devaTest,
3612 "Katakana", kataTest,
3613 "" // END MARKER
3614 };
3615
3616 UnicodeString test("The Quick Brown Fox Jumped Over The Lazy Dog.");
3617 int32_t i = 0, j=0, k=0;
3618 int32_t sources = Transliterator::countAvailableSources();
3619 for (i = 0; i < sources; i++) {
3620 UnicodeString source;
3621 Transliterator::getAvailableSource(i, source);
3622 UnicodeString test = _findMatch(source, tests);
3623 if (test.length() == 0) {
3624 logln((UnicodeString)"Skipping " + source + "-X");
3625 continue;
3626 }
3627 int32_t targets = Transliterator::countAvailableTargets(source);
3628 for (j = 0; j < targets; j++) {
3629 UnicodeString target;
3630 Transliterator::getAvailableTarget(j, source, target);
3631 int32_t variants = Transliterator::countAvailableVariants(source, target);
3632 for (k =0; k< variants; k++) {
3633 UnicodeString variant;
3634 UParseError err;
3635 UErrorCode status = U_ZERO_ERROR;
3636
3637 Transliterator::getAvailableVariant(k, source, target, variant);
3638 UnicodeString id = source + "-" + target + "/" + variant;
3639
3640 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, err, status);
3641 if (U_FAILURE(status)) {
3642 dataerrln((UnicodeString)"FAIL: Could not create " + id + ", status " + u_errorName(status));
3643 delete t;
3644 continue;
3645 }
3646 status = U_ZERO_ERROR;
3647 CheckIncrementalAux(t, test);
3648
3649 UnicodeString rev;
3650 _trans(*t, test, rev);
3651 Transliterator *inv = t->createInverse(status);
3652 if (U_FAILURE(status)) {
3653 // The following are forward-only, it is OK that creating an inverse will not work:
3654 // 1. Devanagari-Arabic
3655 // 2. Any-*/BGN
3656 // 2a. Any-*/BGN_1981
3657 // 3. Any-*/MNS
3658 //
3659 // 4. If UCONFIG_NO_BREAK_ITERATION is on, Latin-Thai is also not expected to work.
3660 //
3661 // The following are direction="both" transforms with variants, inverting the Any-Xxxx/Variant for
3662 // any of these does not work; see ICU-21911 (not sure whether this is intentional or an ICU bug).
3663 // Unfortunately we do not easily have the info at this point as to whether the original transform
3664 // had direction="both" specified.
3665 // 5. Any-*/UNGEGN
3666 // 6. Any-Ethiopic/*
3667 // 7. Any-Braille/*
3668 // 8. Any-*/Gurage_2013
3669 // 9. Any-*/Gutgarts
3670 // 10. Any-*/Tekie_Alibekit
3671 // 11. Any-*/Xaleget
3672 //
3673 if ( id.compare((UnicodeString)"Devanagari-Arabic/") != 0
3674 && !(id.startsWith((UnicodeString)"Any-") &&
3675 (id.endsWith((UnicodeString)"/BGN") || id.endsWith((UnicodeString)"/BGN_1981") || id.endsWith((UnicodeString)"/MNS"))
3676 )
3677 #if UCONFIG_NO_BREAK_ITERATION
3678 && id.compare((UnicodeString)"Latin-Thai/") != 0
3679 #endif
3680 && !(logKnownIssue("21911", "ICU4C cannot create inverse of Any-Xxxx/Variant transform created from both-direction transform") &&
3681 id.startsWith((UnicodeString)"Any-") &&
3682 (id.endsWith((UnicodeString)"/UNGEGN") || id.startsWith((UnicodeString)"Any-Ethiopic/") || id.startsWith((UnicodeString)"Any-Braille/") ||
3683 id.endsWith((UnicodeString)"/Gurage_2013") || id.endsWith((UnicodeString)"/Gutgarts") || id.endsWith((UnicodeString)"/Tekie_Alibekit") ||
3684 id.endsWith((UnicodeString)"/Xaleget"))
3685 )
3686 )
3687 {
3688 errln((UnicodeString)"FAIL: Could not create inverse of " + id + ", status " + u_errorName(status));
3689 }
3690 delete t;
3691 delete inv;
3692 continue;
3693 }
3694 CheckIncrementalAux(inv, rev);
3695 delete t;
3696 delete inv;
3697 }
3698 }
3699 }
3700 }
3701
CheckIncrementalAux(const Transliterator * t,const UnicodeString & input)3702 void TransliteratorTest::CheckIncrementalAux(const Transliterator* t,
3703 const UnicodeString& input) {
3704 UErrorCode ec = U_ZERO_ERROR;
3705 UTransPosition pos;
3706 UnicodeString test = input;
3707
3708 pos.contextStart = 0;
3709 pos.contextLimit = input.length();
3710 pos.start = 0;
3711 pos.limit = input.length();
3712
3713 t->transliterate(test, pos, ec);
3714 if (U_FAILURE(ec)) {
3715 errln((UnicodeString)"FAIL: transliterate() error " + u_errorName(ec));
3716 return;
3717 }
3718 UBool gotError = false;
3719 (void)gotError; // Suppress set but not used warning.
3720
3721 // we have a few special cases. Any-Remove (pos.start = 0, but also = limit) and U+XXXXX?X?
3722
3723 if (pos.start == 0 && pos.limit != 0 && t->getID() != "Hex-Any/Unicode") {
3724 errln((UnicodeString)"No Progress, " +
3725 t->getID() + ": " + formatInput(test, input, pos));
3726 gotError = true;
3727 } else {
3728 logln((UnicodeString)"PASS Progress, " +
3729 t->getID() + ": " + formatInput(test, input, pos));
3730 }
3731 t->finishTransliteration(test, pos);
3732 if (pos.start != pos.limit) {
3733 errln((UnicodeString)"Incomplete, " +
3734 t->getID() + ": " + formatInput(test, input, pos));
3735 gotError = true;
3736 }
3737 }
3738
TestFunction()3739 void TransliteratorTest::TestFunction() {
3740 // Careful with spacing and ';' here: Phrase this exactly
3741 // as toRules() is going to return it. If toRules() changes
3742 // with regard to spacing or ';', then adjust this string.
3743 UnicodeString rule =
3744 "([:Lu:]) > $1 '(' &Lower( $1 ) '=' &Hex( &Any-Lower( $1 ) ) ')';";
3745
3746 UParseError pe;
3747 UErrorCode ec = U_ZERO_ERROR;
3748 Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3749 if (t == NULL) {
3750 dataerrln("FAIL: createFromRules failed - %s", u_errorName(ec));
3751 return;
3752 }
3753
3754 UnicodeString r;
3755 t->toRules(r, true);
3756 if (r == rule) {
3757 logln((UnicodeString)"OK: toRules() => " + r);
3758 } else {
3759 errln((UnicodeString)"FAIL: toRules() => " + r +
3760 ", expected " + rule);
3761 }
3762
3763 expect(*t, "The Quick Brown Fox",
3764 UNICODE_STRING_SIMPLE("T(t=\\u0074)he Q(q=\\u0071)uick B(b=\\u0062)rown F(f=\\u0066)ox"));
3765
3766 delete t;
3767 }
3768
TestInvalidBackRef(void)3769 void TransliteratorTest::TestInvalidBackRef(void) {
3770 UnicodeString rule = ". > $1;";
3771 UnicodeString rule2 =CharsToUnicodeString("(.) <> &hex/unicode($1) &name($1); . > $1; [{}] >\\u0020;");
3772 UParseError pe;
3773 UErrorCode ec = U_ZERO_ERROR;
3774 Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3775 Transliterator *t2 = Transliterator::createFromRules("Test2", rule2, UTRANS_FORWARD, pe, ec);
3776
3777 if (t != NULL) {
3778 errln("FAIL: createFromRules should have returned NULL");
3779 delete t;
3780 }
3781
3782 if (t2 != NULL) {
3783 errln("FAIL: createFromRules should have returned NULL");
3784 delete t2;
3785 }
3786
3787 if (U_SUCCESS(ec)) {
3788 errln("FAIL: Ok: . > $1; => no error");
3789 } else {
3790 logln((UnicodeString)"Ok: . > $1; => " + u_errorName(ec));
3791 }
3792 }
3793
TestMulticharStringSet()3794 void TransliteratorTest::TestMulticharStringSet() {
3795 // Basic testing
3796 const char* rule =
3797 " [{aa}] > x;"
3798 " a > y;"
3799 " [b{bc}] > z;"
3800 "[{gd}] { e > q;"
3801 " e } [{fg}] > r;" ;
3802
3803 UParseError pe;
3804 UErrorCode ec = U_ZERO_ERROR;
3805 Transliterator* t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3806 if (t == NULL || U_FAILURE(ec)) {
3807 delete t;
3808 errln("FAIL: createFromRules failed");
3809 return;
3810 }
3811
3812 expect(*t, "a aa ab bc d gd de gde gdefg ddefg",
3813 "y x yz z d gd de gdq gdqfg ddrfg");
3814 delete t;
3815
3816 // Overlapped string test. Make sure that when multiple
3817 // strings can match that the longest one is matched.
3818 rule =
3819 " [a {ab} {abc}] > x;"
3820 " b > y;"
3821 " c > z;"
3822 " q [t {st} {rst}] { e > p;" ;
3823
3824 t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3825 if (t == NULL || U_FAILURE(ec)) {
3826 delete t;
3827 errln("FAIL: createFromRules failed");
3828 return;
3829 }
3830
3831 expect(*t, "a ab abc qte qste qrste",
3832 "x x x qtp qstp qrstp");
3833 delete t;
3834 }
3835
3836 // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
3837 // BEGIN TestUserFunction support factory
3838
// Slot tables used by the TestUserFunction factory helpers that follow.
// _TUFF[n] is the prototype transliterator that _TUFFactory clones for slot n;
// _TUFID[n] is a heap-allocated copy of the ID that slot was registered under.
// TestUserFunction resets every _TUFF entry to NULL before using the tables.
Transliterator* _TUFF[4];
UnicodeString* _TUFID[4];
3841
_TUFFactory(const UnicodeString &,Transliterator::Token context)3842 static Transliterator* U_EXPORT2 _TUFFactory(const UnicodeString& /*ID*/,
3843 Transliterator::Token context) {
3844 return _TUFF[context.integer]->clone();
3845 }
3846
_TUFReg(const UnicodeString & ID,Transliterator * t,int32_t n)3847 static void _TUFReg(const UnicodeString& ID, Transliterator* t, int32_t n) {
3848 _TUFF[n] = t;
3849 _TUFID[n] = new UnicodeString(ID);
3850 Transliterator::registerFactory(ID, _TUFFactory, Transliterator::integerToken(n));
3851 }
3852
_TUFUnreg(int32_t n)3853 static void _TUFUnreg(int32_t n) {
3854 if (_TUFF[n] != NULL) {
3855 Transliterator::unregister(*_TUFID[n]);
3856 delete _TUFF[n];
3857 delete _TUFID[n];
3858 }
3859 }
3860
3861 // END TestUserFunction support factory
3862 // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3863
3864 /**
3865 * Test that user-registered transliterators can be used under function
3866 * syntax.
3867 */
TestUserFunction()3868 void TransliteratorTest::TestUserFunction() {
3869
3870 Transliterator* t;
3871 UParseError pe;
3872 UErrorCode ec = U_ZERO_ERROR;
3873
3874 // Setup our factory
3875 int32_t i;
3876 for (i=0; i<4; ++i) {
3877 _TUFF[i] = NULL;
3878 }
3879
3880 // There's no need to register inverses if we don't use them
3881 t = Transliterator::createFromRules("gif",
3882 UNICODE_STRING_SIMPLE("'\\'u(..)(..) > '<img src=\"http://www.unicode.org/gifs/24/' $1 '/U' $1$2 '.gif\">';"),
3883 UTRANS_FORWARD, pe, ec);
3884 if (t == NULL || U_FAILURE(ec)) {
3885 dataerrln((UnicodeString)"FAIL: createFromRules gif " + u_errorName(ec));
3886 return;
3887 }
3888 _TUFReg("Any-gif", t, 0);
3889
3890 t = Transliterator::createFromRules("RemoveCurly",
3891 UNICODE_STRING_SIMPLE("[\\{\\}] > ; '\\N' > ;"),
3892 UTRANS_FORWARD, pe, ec);
3893 if (t == NULL || U_FAILURE(ec)) {
3894 errln((UnicodeString)"FAIL: createFromRules RemoveCurly " + u_errorName(ec));
3895 goto FAIL;
3896 }
3897 expect(*t, UNICODE_STRING_SIMPLE("\\N{name}"), "name");
3898 _TUFReg("Any-RemoveCurly", t, 1);
3899
3900 logln("Trying &hex");
3901 t = Transliterator::createFromRules("hex2",
3902 "(.) > &hex($1);",
3903 UTRANS_FORWARD, pe, ec);
3904 if (t == NULL || U_FAILURE(ec)) {
3905 errln("FAIL: createFromRules");
3906 goto FAIL;
3907 }
3908 logln("Registering");
3909 _TUFReg("Any-hex2", t, 2);
3910 t = Transliterator::createInstance("Any-hex2", UTRANS_FORWARD, ec);
3911 if (t == NULL || U_FAILURE(ec)) {
3912 errln((UnicodeString)"FAIL: createInstance Any-hex2 " + u_errorName(ec));
3913 goto FAIL;
3914 }
3915 expect(*t, "abc", UNICODE_STRING_SIMPLE("\\u0061\\u0062\\u0063"));
3916 delete t;
3917
3918 logln("Trying &gif");
3919 t = Transliterator::createFromRules("gif2",
3920 "(.) > &Gif(&Hex2($1));",
3921 UTRANS_FORWARD, pe, ec);
3922 if (t == NULL || U_FAILURE(ec)) {
3923 errln((UnicodeString)"FAIL: createFromRules gif2 " + u_errorName(ec));
3924 goto FAIL;
3925 }
3926 logln("Registering");
3927 _TUFReg("Any-gif2", t, 3);
3928 t = Transliterator::createInstance("Any-gif2", UTRANS_FORWARD, ec);
3929 if (t == NULL || U_FAILURE(ec)) {
3930 errln((UnicodeString)"FAIL: createInstance Any-gif2 " + u_errorName(ec));
3931 goto FAIL;
3932 }
3933 expect(*t, "ab", "<img src=\"http://www.unicode.org/gifs/24/00/U0061.gif\">"
3934 "<img src=\"http://www.unicode.org/gifs/24/00/U0062.gif\">");
3935 delete t;
3936
3937 // Test that filters are allowed after &
3938 t = Transliterator::createFromRules("test",
3939 "(.) > &Hex($1) ' ' &RemoveCurly(&Name($1)) ' ';",
3940 UTRANS_FORWARD, pe, ec);
3941 if (t == NULL || U_FAILURE(ec)) {
3942 errln((UnicodeString)"FAIL: createFromRules test " + u_errorName(ec));
3943 goto FAIL;
3944 }
3945 expect(*t, "abc",
3946 UNICODE_STRING_SIMPLE("\\u0061 LATIN SMALL LETTER A \\u0062 LATIN SMALL LETTER B \\u0063 LATIN SMALL LETTER C "));
3947 delete t;
3948
3949 FAIL:
3950 for (i=0; i<4; ++i) {
3951 _TUFUnreg(i);
3952 }
3953 }
3954
3955 /**
3956 * Test the Any-X transliterators.
3957 */
TestAnyX(void)3958 void TransliteratorTest::TestAnyX(void) {
3959 UParseError parseError;
3960 UErrorCode status = U_ZERO_ERROR;
3961 Transliterator* anyLatin =
3962 Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
3963 if (anyLatin==0) {
3964 dataerrln("FAIL: createInstance returned NULL - %s", u_errorName(status));
3965 delete anyLatin;
3966 return;
3967 }
3968
3969 expect(*anyLatin,
3970 CharsToUnicodeString("greek:\\u03B1\\u03B2\\u03BA\\u0391\\u0392\\u039A hiragana:\\u3042\\u3076\\u304F cyrillic:\\u0430\\u0431\\u0446"),
3971 CharsToUnicodeString("greek:abkABK hiragana:abuku cyrillic:abc"));
3972
3973 delete anyLatin;
3974 }
3975
3976 /**
3977 * Test Any-X transliterators with sample letters from all scripts.
3978 */
TestAny(void)3979 void TransliteratorTest::TestAny(void) {
3980 UErrorCode status = U_ZERO_ERROR;
3981 // Note: there is a lot of implicit construction of UnicodeStrings from (char *) in
3982 // function call parameters going on in this test.
3983 UnicodeSet alphabetic("[:alphabetic:]", status);
3984 if (U_FAILURE(status)) {
3985 dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
3986 return;
3987 }
3988 alphabetic.freeze();
3989
3990 UnicodeString testString;
3991 for (int32_t i = 0; i < USCRIPT_CODE_LIMIT; i++) {
3992 const char *scriptName = uscript_getShortName((UScriptCode)i);
3993 if (scriptName == NULL) {
3994 errln("Failure: file %s, line %d: Script Code %d is invalid, ", __FILE__, __LINE__, i);
3995 return;
3996 }
3997
3998 UnicodeSet sample;
3999 sample.applyPropertyAlias("script", scriptName, status);
4000 if (U_FAILURE(status)) {
4001 errln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4002 return;
4003 }
4004 sample.retainAll(alphabetic);
4005 for (int32_t count=0; count<5; count++) {
4006 UChar32 c = sample.charAt(count);
4007 if (c == -1) {
4008 break;
4009 }
4010 testString.append(c);
4011 }
4012 }
4013
4014 UParseError parseError;
4015 Transliterator* anyLatin =
4016 Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4017 if (U_FAILURE(status)) {
4018 dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4019 return;
4020 }
4021
4022 logln(UnicodeString("Sample set for Any-Latin: ") + testString);
4023 anyLatin->transliterate(testString);
4024 logln(UnicodeString("Sample result for Any-Latin: ") + testString);
4025 delete anyLatin;
4026 }
4027
4028
4029 /**
4030 * Test the source and target set API. These are only implemented
4031 * for RBT and CompoundTransliterator at this time.
4032 */
TestSourceTargetSet()4033 void TransliteratorTest::TestSourceTargetSet() {
4034 UErrorCode ec = U_ZERO_ERROR;
4035
4036 // Rules
4037 const char* r =
4038 "a > b; "
4039 "r [x{lu}] > q;";
4040
4041 // Expected source
4042 UnicodeSet expSrc("[arx{lu}]", ec);
4043
4044 // Expected target
4045 UnicodeSet expTrg("[bq]", ec);
4046
4047 UParseError pe;
4048 Transliterator* t = Transliterator::createFromRules("test", r, UTRANS_FORWARD, pe, ec);
4049
4050 if (U_FAILURE(ec)) {
4051 delete t;
4052 errln("FAIL: Couldn't set up test");
4053 return;
4054 }
4055
4056 UnicodeSet src; t->getSourceSet(src);
4057 UnicodeSet trg; t->getTargetSet(trg);
4058
4059 if (src == expSrc && trg == expTrg) {
4060 UnicodeString a, b;
4061 logln((UnicodeString)"Ok: " +
4062 r + " => source = " + src.toPattern(a, true) +
4063 ", target = " + trg.toPattern(b, true));
4064 } else {
4065 UnicodeString a, b, c, d;
4066 errln((UnicodeString)"FAIL: " +
4067 r + " => source = " + src.toPattern(a, true) +
4068 ", expected " + expSrc.toPattern(b, true) +
4069 "; target = " + trg.toPattern(c, true) +
4070 ", expected " + expTrg.toPattern(d, true));
4071 }
4072
4073 delete t;
4074 }
4075
4076 /**
4077 * Test handling of Pattern_White_Space, for both RBT and UnicodeSet.
4078 */
TestPatternWhiteSpace()4079 void TransliteratorTest::TestPatternWhiteSpace() {
4080 // Rules
4081 const char* r = "a > \\u200E b;";
4082
4083 UErrorCode ec = U_ZERO_ERROR;
4084 UParseError pe;
4085 Transliterator* t = Transliterator::createFromRules("test", CharsToUnicodeString(r), UTRANS_FORWARD, pe, ec);
4086
4087 if (U_FAILURE(ec)) {
4088 errln("FAIL: Couldn't set up test");
4089 } else {
4090 expect(*t, "a", "b");
4091 }
4092 delete t;
4093
4094 // UnicodeSet
4095 ec = U_ZERO_ERROR;
4096 UnicodeSet set(CharsToUnicodeString("[a \\u200E]"), ec);
4097
4098 if (U_FAILURE(ec)) {
4099 errln("FAIL: Couldn't set up test");
4100 } else {
4101 if (set.contains(0x200E)) {
4102 errln("FAIL: U+200E not being ignored by UnicodeSet");
4103 }
4104 }
4105 }
4106 //======================================================================
4107 // this method is in TestUScript.java
4108 //======================================================================
TestAllCodepoints()4109 void TransliteratorTest::TestAllCodepoints(){
4110 UScriptCode code= USCRIPT_INVALID_CODE;
4111 char id[256]={'\0'};
4112 char abbr[256]={'\0'};
4113 char newId[256]={'\0'};
4114 char newAbbrId[256]={'\0'};
4115 char oldId[256]={'\0'};
4116 char oldAbbrId[256]={'\0'};
4117
4118 UErrorCode status =U_ZERO_ERROR;
4119 UParseError pe;
4120
4121 for(uint32_t i = 0; i<=0x10ffff; i++){
4122 code = uscript_getScript(i,&status);
4123 if(code == USCRIPT_INVALID_CODE){
4124 dataerrln("uscript_getScript for codepoint \\U%08X failed.", i);
4125 }
4126 const char* myId = uscript_getName(code);
4127 if(!myId) {
4128 dataerrln("Valid script code returned NULL name. Check your data!");
4129 return;
4130 }
4131 uprv_strcpy(id,myId);
4132 uprv_strcpy(abbr,uscript_getShortName(code));
4133
4134 uprv_strcpy(newId,"[:");
4135 uprv_strcat(newId,id);
4136 uprv_strcat(newId,":];NFD");
4137
4138 uprv_strcpy(newAbbrId,"[:");
4139 uprv_strcat(newAbbrId,abbr);
4140 uprv_strcat(newAbbrId,":];NFD");
4141
4142 if(uprv_strcmp(newId,oldId)!=0){
4143 Transliterator* t = Transliterator::createInstance(newId,UTRANS_FORWARD,pe,status);
4144 if(t==NULL || U_FAILURE(status)){
4145 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4146 }
4147 delete t;
4148 }
4149 if(uprv_strcmp(newAbbrId,oldAbbrId)!=0){
4150 Transliterator* t = Transliterator::createInstance(newAbbrId,UTRANS_FORWARD,pe,status);
4151 if(t==NULL || U_FAILURE(status)){
4152 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4153 }
4154 delete t;
4155 }
4156 uprv_strcpy(oldId,newId);
4157 uprv_strcpy(oldAbbrId, newAbbrId);
4158
4159 }
4160
4161 }
4162
/*
 * TEST_TRANSLIT_ID(id, cls): creates the transliterator registered under
 * `id` and reports an error if its dynamic class ID differs from the static
 * class ID of `cls`.  `id` is also passed to a "%s" format, so it must be a
 * C string.  The created object is deleted before the macro ends.
 * The macro calls dataerrln()/errln() unqualified, so it has to be expanded
 * where those names are in scope.
 */
#define TEST_TRANSLIT_ID(id, cls) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode ec = U_ZERO_ERROR; \
    Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); \
    if (U_FAILURE(ec)) { \
        dataerrln("FAIL: Couldn't create %s - %s", id, u_errorName(ec)); \
    } else { \
        if (t->getDynamicClassID() != cls::getStaticClassID()) { \
            errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
        } \
        /* *t = *t; */ /*can't do this: coverage test for assignment op*/ \
    } \
    delete t; \
} UPRV_BLOCK_MACRO_END
4176
/*
 * TEST_TRANSLIT_RULE(rule, cls): builds a transliterator named "_" from
 * `rule` and reports an error if its dynamic class ID differs from the
 * static class ID of `cls`.  `rule` must be a string literal: the failure
 * message is formed by placing it next to another literal so the compiler
 * concatenates them.  The created object is deleted before the macro ends.
 */
#define TEST_TRANSLIT_RULE(rule, cls) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode ec = U_ZERO_ERROR; \
    UParseError pe; \
    Transliterator* t = Transliterator::createFromRules("_", rule, UTRANS_FORWARD, pe, ec); \
    if (U_FAILURE(ec)) { \
        errln("FAIL: Couldn't create " rule); \
    } else { \
        if (t->getDynamicClassID() != cls ::getStaticClassID()) { \
            errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
        } \
        /* *t = *t; */ /*can't do this: coverage test for assignment op*/ \
    } \
    delete t; \
} UPRV_BLOCK_MACRO_END
4191
/**
 * For each listed ID (and one inline rule) creates the transliterator and
 * checks that the object's dynamic class ID matches the static class ID of
 * the class named next to it.
 */
void TransliteratorTest::TestBoilerplate() {
    // Each line pairs an ID with the implementation class expected for it.
    TEST_TRANSLIT_ID("Any-Latin", AnyTransliterator);
    TEST_TRANSLIT_ID("Any-Hex", EscapeTransliterator);
    TEST_TRANSLIT_ID("Hex-Any", UnescapeTransliterator);
    TEST_TRANSLIT_ID("Lower", LowercaseTransliterator);
    TEST_TRANSLIT_ID("Upper", UppercaseTransliterator);
    TEST_TRANSLIT_ID("Title", TitlecaseTransliterator);
    TEST_TRANSLIT_ID("Null", NullTransliterator);
    TEST_TRANSLIT_ID("Remove", RemoveTransliterator);
    TEST_TRANSLIT_ID("Any-Name", UnicodeNameTransliterator);
    TEST_TRANSLIT_ID("Name-Any", NameUnicodeTransliterator);
    TEST_TRANSLIT_ID("NFD", NormalizationTransliterator);
    TEST_TRANSLIT_ID("Latin-Greek", CompoundTransliterator);
    // A transliterator built directly from rule text.
    TEST_TRANSLIT_RULE("a>b;", RuleBasedTransliterator);
}
4207
TestAlternateSyntax()4208 void TransliteratorTest::TestAlternateSyntax() {
4209 // U+2206 == &
4210 // U+2190 == <
4211 // U+2192 == >
4212 // U+2194 == <>
4213 expect(CharsToUnicodeString("a \\u2192 x; b \\u2190 y; c \\u2194 z"),
4214 "abc",
4215 "xbz");
4216 expect(CharsToUnicodeString("([:^ASCII:]) \\u2192 \\u2206Name($1);"),
4217 CharsToUnicodeString("<=\\u2190; >=\\u2192; <>=\\u2194; &=\\u2206"),
4218 UNICODE_STRING_SIMPLE("<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}"));
4219 }
4220
// Rule sets shared by TestBeginEnd and TestBeginEndToRules.  Entries are
// referenced by index from BEGIN_END_TEST_CASES, so the rule sets that use
// ::BEGIN/::END are commented out and replaced by an empty string instead of
// being removed; that keeps the index of every remaining entry stable.
static const char* BEGIN_END_RULES[] = {
    // [0]
    "abc > xy;"
    "aba > z;",

    // [1]
/*
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "::BEGIN;"
    "aba > z;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [2]
/*
    "abc > xy;"
    "::BEGIN;"
    "aba > z;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [3]
/*
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "aba > z;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [4]
    "abc > xy;"
    "::Null;"
    "aba > z;",

    // [5]
    "::Upper;"
    "ABC > xy;"
    "AB > x;"
    "C > z;"
    "::Upper;"
    "XYZ > p;"
    "XY > q;"
    "Z > r;"
    "::Upper;",

    // [6]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [7]
    "::Null;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [8]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;",

    // [9]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [10]
/*
    "::BEGIN;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::END;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [11]
/*
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::BEGIN;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [12]
/*
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::BEGIN;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::END;"
    "::BEGIN;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::END;"
    "::BEGIN;"
    "'a-a' > a\\%|a;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [13]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::Null;"
    "'a-a' > a\\%|a;",

    // [14]
/*
    "::[abc];"
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "::BEGIN;"
    "aba > yz;"
    "::END;"
    "::Upper;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [15]
    "::[abc];"
    "abc > xy;"
    "::Null;"
    "aba > yz;"
    "::Upper;",

    // [16]
/*
    "::[abc];"
    "::BEGIN;"
    "abc <> xy;"
    "::END;"
    "::BEGIN;"
    "aba <> yz;"
    "::END;"
    "::Upper(Lower);"
    "::([XYZ]);"
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [17]
    // The one reversible rule set; TestBeginEnd and TestBeginEndToRules also
    // instantiate it in the reverse direction.
    "::[abc];"
    "abc <> xy;"
    "::Null;"
    "aba <> yz;"
    "::Upper(Lower);"
    "::([XYZ]);"
};
4394
4395 /*
4396 (This entire test is commented out below and will need some heavy revision when we re-add
4397 the ::BEGIN/::END stuff)
4398 static const char* BOGUS_BEGIN_END_RULES[] = {
4399 // [7]
4400 "::BEGIN;"
4401 "abc > xy;"
4402 "::BEGIN;"
4403 "aba > z;"
4404 "::END;"
4405 "::END;",
4406
4407 // [8]
4408 "abc > xy;"
4409 " aba > z;"
4410 "::END;",
4411
4412 // [9]
4413 "::BEGIN;"
4414 "::Upper;"
4415 "::END;"
4416 };
4417 static const int32_t BOGUS_BEGIN_END_RULES_length = UPRV_LENGTHOF(BOGUS_BEGIN_END_RULES);
4418 */
4419
// Flat table of test cases consumed three entries at a time:
// rule set (an entry of BEGIN_END_RULES), input text, expected output.
// Cases whose rule set is disabled in BEGIN_END_RULES are commented out here too.
static const char* BEGIN_END_TEST_CASES[] = {
    // rules             input                   expected output
    BEGIN_END_RULES[0], "abc ababc aba", "xy zbc z",
//  BEGIN_END_RULES[1], "abc ababc aba", "xy abxy z",
//  BEGIN_END_RULES[2], "abc ababc aba", "xy abxy z",
//  BEGIN_END_RULES[3], "abc ababc aba", "xy abxy z",
    BEGIN_END_RULES[4], "abc ababc aba", "xy abxy z",
    BEGIN_END_RULES[5], "abccabaacababcbc", "PXAARXQBR",

    BEGIN_END_RULES[6], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[7], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[8], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[9], "e e - e---e- e", "e e e-e-e",
//  BEGIN_END_RULES[10], "e e - e---e- e", "e e e-e-e",
//  BEGIN_END_RULES[11], "e e - e---e- e", "e e e-e-e",
//  BEGIN_END_RULES[12], "e e - e---e- e", "e e e-e-e",
//  BEGIN_END_RULES[12], "a a a a", "a%a%a%a",
//  BEGIN_END_RULES[12], "a a-b c b a", "a%a-b cb-a",
    BEGIN_END_RULES[13], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[13], "a a a a", "a%a%a%a",
    BEGIN_END_RULES[13], "a a-b c b a", "a%a-b cb-a",

//  BEGIN_END_RULES[14], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[15], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
//  BEGIN_END_RULES[16], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[17], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ"
};
// Number of strings in the table (three per test case).
static const int32_t BEGIN_END_TEST_CASES_length = UPRV_LENGTHOF(BEGIN_END_TEST_CASES);
4448
TestBeginEnd()4449 void TransliteratorTest::TestBeginEnd() {
4450 // run through the list of test cases above
4451 int32_t i = 0;
4452 for (i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4453 expect((UnicodeString)"Test case #" + (i / 3),
4454 UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4455 UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4456 UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4457 }
4458
4459 // instantiate the one reversible rule set in the reverse direction and make sure it does the right thing
4460 UParseError parseError;
4461 UErrorCode status = U_ZERO_ERROR;
4462 Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4463 UTRANS_REVERSE, parseError, status);
4464 if (reversed == 0 || U_FAILURE(status)) {
4465 reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4466 } else {
4467 expect(*reversed, UnicodeString("xy XY XYZ yz YZ"), UnicodeString("xy abc xaba yz aba"));
4468 }
4469 delete reversed;
4470
4471 // finally, run through the list of syntactically-ill-formed rule sets above and make sure
4472 // that all of them cause errors
4473 /*
4474 (commented out until we have the real ::BEGIN/::END stuff in place
4475 for (i = 0; i < BOGUS_BEGIN_END_RULES_length; i++) {
4476 UParseError parseError;
4477 UErrorCode status = U_ZERO_ERROR;
4478 Transliterator* t = Transliterator::createFromRules("foo", UnicodeString(BOGUS_BEGIN_END_RULES[i]),
4479 UTRANS_FORWARD, parseError, status);
4480 if (!U_FAILURE(status)) {
4481 delete t;
4482 errln((UnicodeString)"Should have gotten syntax error from " + BOGUS_BEGIN_END_RULES[i]);
4483 }
4484 }
4485 */
4486 }
4487
TestBeginEndToRules()4488 void TransliteratorTest::TestBeginEndToRules() {
4489 // run through the same list of test cases we used above, but this time, instead of just
4490 // instantiating a Transliterator from the rules and running the test against it, we instantiate
4491 // a Transliterator from the rules, do toRules() on it, instantiate a Transliterator from
4492 // the resulting set of rules, and make sure that the generated rule set is semantically equivalent
4493 // to (i.e., does the same thing as) the original rule set
4494 for (int32_t i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4495 UParseError parseError;
4496 UErrorCode status = U_ZERO_ERROR;
4497 Transliterator* t = Transliterator::createFromRules("--", UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4498 UTRANS_FORWARD, parseError, status);
4499 if (U_FAILURE(status)) {
4500 reportParseError(UnicodeString("FAIL: Couldn't create transliterator"), parseError, status);
4501 } else {
4502 UnicodeString rules;
4503 t->toRules(rules, true);
4504 Transliterator* t2 = Transliterator::createFromRules((UnicodeString)"Test case #" + (i / 3), rules,
4505 UTRANS_FORWARD, parseError, status);
4506 if (U_FAILURE(status)) {
4507 reportParseError(UnicodeString("FAIL: Couldn't create transliterator from generated rules"),
4508 parseError, status);
4509 delete t;
4510 } else {
4511 expect(*t2,
4512 UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4513 UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4514 delete t;
4515 delete t2;
4516 }
4517 }
4518 }
4519
4520 // do the same thing for the reversible test case
4521 UParseError parseError;
4522 UErrorCode status = U_ZERO_ERROR;
4523 Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4524 UTRANS_REVERSE, parseError, status);
4525 if (U_FAILURE(status)) {
4526 reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4527 } else {
4528 UnicodeString rules;
4529 reversed->toRules(rules, false);
4530 Transliterator* reversed2 = Transliterator::createFromRules("Reversed", rules, UTRANS_FORWARD,
4531 parseError, status);
4532 if (U_FAILURE(status)) {
4533 reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator from generated rules"),
4534 parseError, status);
4535 delete reversed;
4536 } else {
4537 expect(*reversed2,
4538 UnicodeString("xy XY XYZ yz YZ"),
4539 UnicodeString("xy abc xaba yz aba"));
4540 delete reversed;
4541 delete reversed2;
4542 }
4543 }
4544 }
4545
TestRegisterAlias()4546 void TransliteratorTest::TestRegisterAlias() {
4547 UnicodeString longID("Lower;[aeiou]Upper");
4548 UnicodeString shortID("Any-CapVowels");
4549 UnicodeString reallyShortID("CapVowels");
4550
4551 Transliterator::registerAlias(shortID, longID);
4552
4553 UErrorCode err = U_ZERO_ERROR;
4554 Transliterator* t1 = Transliterator::createInstance(longID, UTRANS_FORWARD, err);
4555 if (U_FAILURE(err)) {
4556 errln("Failed to instantiate transliterator with long ID");
4557 Transliterator::unregister(shortID);
4558 return;
4559 }
4560 Transliterator* t2 = Transliterator::createInstance(reallyShortID, UTRANS_FORWARD, err);
4561 if (U_FAILURE(err)) {
4562 errln("Failed to instantiate transliterator with short ID");
4563 delete t1;
4564 Transliterator::unregister(shortID);
4565 return;
4566 }
4567
4568 if (t1->getID() != longID)
4569 errln("Transliterator instantiated with long ID doesn't have long ID");
4570 if (t2->getID() != reallyShortID)
4571 errln("Transliterator instantiated with short ID doesn't have short ID");
4572
4573 UnicodeString rules1;
4574 UnicodeString rules2;
4575
4576 t1->toRules(rules1, true);
4577 t2->toRules(rules2, true);
4578 if (rules1 != rules2)
4579 errln("Alias transliterators aren't the same");
4580
4581 delete t1;
4582 delete t2;
4583 Transliterator::unregister(shortID);
4584
4585 t1 = Transliterator::createInstance(shortID, UTRANS_FORWARD, err);
4586 if (U_SUCCESS(err)) {
4587 errln("Instantiation with short ID succeeded after short ID was unregistered");
4588 delete t1;
4589 }
4590
4591 // try the same thing again, but this time with something other than
4592 // an instance of CompoundTransliterator
4593 UnicodeString realID("Latin-Greek");
4594 UnicodeString fakeID("Latin-dlgkjdflkjdl");
4595 Transliterator::registerAlias(fakeID, realID);
4596
4597 err = U_ZERO_ERROR;
4598 t1 = Transliterator::createInstance(realID, UTRANS_FORWARD, err);
4599 if (U_FAILURE(err)) {
4600 dataerrln("Failed to instantiate transliterator with real ID - %s", u_errorName(err));
4601 Transliterator::unregister(realID);
4602 return;
4603 }
4604 t2 = Transliterator::createInstance(fakeID, UTRANS_FORWARD, err);
4605 if (U_FAILURE(err)) {
4606 errln("Failed to instantiate transliterator with fake ID");
4607 delete t1;
4608 Transliterator::unregister(realID);
4609 return;
4610 }
4611
4612 t1->toRules(rules1, true);
4613 t2->toRules(rules2, true);
4614 if (rules1 != rules2)
4615 errln("Alias transliterators aren't the same");
4616
4617 delete t1;
4618 delete t2;
4619 Transliterator::unregister(fakeID);
4620 }
4621
TestRuleStripping()4622 void TransliteratorTest::TestRuleStripping() {
4623 /*
4624 #
4625 \uE001>\u0C01; # SIGN
4626 */
4627 static const UChar rule[] = {
4628 0x0023,0x0020,0x000D,0x000A,
4629 0xE001,0x003E,0x0C01,0x003B,0x0020,0x0023,0x0020,0x0053,0x0049,0x0047,0x004E,0
4630 };
4631 static const UChar expectedRule[] = {
4632 0xE001,0x003E,0x0C01,0x003B,0
4633 };
4634 UChar result[UPRV_LENGTHOF(rule)];
4635 UErrorCode status = U_ZERO_ERROR;
4636 int32_t len = utrans_stripRules(rule, UPRV_LENGTHOF(rule), result, &status);
4637 if (len != u_strlen(expectedRule)) {
4638 errln("utrans_stripRules return len = %d", len);
4639 }
4640 if (u_strncmp(expectedRule, result, len) != 0) {
4641 errln("utrans_stripRules did not return expected string");
4642 }
4643 }
4644
4645 /**
4646 * Test the Halfwidth-Fullwidth transliterator (ticket 6281).
4647 */
TestHalfwidthFullwidth(void)4648 void TransliteratorTest::TestHalfwidthFullwidth(void) {
4649 UParseError parseError;
4650 UErrorCode status = U_ZERO_ERROR;
4651 Transliterator* hf = Transliterator::createInstance("Halfwidth-Fullwidth", UTRANS_FORWARD, parseError, status);
4652 Transliterator* fh = Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, parseError, status);
4653 if (hf == 0 || fh == 0) {
4654 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4655 delete hf;
4656 delete fh;
4657 return;
4658 }
4659
4660 // Array of 2n items
4661 // Each item is
4662 // "hf"|"fh"|"both",
4663 // <Halfwidth>,
4664 // <Fullwidth>
4665 const char* DATA[] = {
4666 "both",
4667 "\\uFFE9\\uFFEA\\uFFEB\\uFFEC\\u0061\\uFF71\\u00AF\\u0020",
4668 "\\u2190\\u2191\\u2192\\u2193\\uFF41\\u30A2\\uFFE3\\u3000",
4669 };
4670 int32_t DATA_length = UPRV_LENGTHOF(DATA);
4671
4672 for (int32_t i=0; i<DATA_length; i+=3) {
4673 UnicodeString h = CharsToUnicodeString(DATA[i+1]);
4674 UnicodeString f = CharsToUnicodeString(DATA[i+2]);
4675 switch (*DATA[i]) {
4676 case 0x68: //'h': // Halfwidth-Fullwidth only
4677 expect(*hf, h, f);
4678 break;
4679 case 0x66: //'f': // Fullwidth-Halfwidth only
4680 expect(*fh, f, h);
4681 break;
4682 case 0x62: //'b': // both directions
4683 expect(*hf, h, f);
4684 expect(*fh, f, h);
4685 break;
4686 }
4687 }
4688 delete hf;
4689 delete fh;
4690 }
4691
4692
4693 /**
4694 * Test Thai. The text is the first paragraph of "What is Unicode" from the Unicode.org web site.
4695 * TODO: confirm that the expected results are correct.
4696 * For now, test just confirms that C++ and Java give identical results.
4697 */
TestThai(void)4698 void TransliteratorTest::TestThai(void) {
4699 #if !UCONFIG_NO_BREAK_ITERATION
4700 // The expectations in this test heavily depends on the Thai dictionary.
4701 // Therefore, we skip this test under the LSTM configuration.
4702 if (skipDictionaryTest()) {
4703 return;
4704 }
4705 UParseError parseError;
4706 UErrorCode status = U_ZERO_ERROR;
4707 Transliterator* tr = Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4708 if (tr == 0) {
4709 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4710 return;
4711 }
4712 if (U_FAILURE(status)) {
4713 errln("FAIL: createInstance failed with %s", u_errorName(status));
4714 return;
4715 }
4716 const char *thaiText =
4717 "\\u0e42\\u0e14\\u0e22\\u0e1e\\u0e37\\u0e49\\u0e19\\u0e10\\u0e32\\u0e19\\u0e41\\u0e25\\u0e49\\u0e27, \\u0e04\\u0e2d"
4718 "\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d\\u0e23\\u0e4c\\u0e08\\u0e30\\u0e40\\u0e01\\u0e35\\u0e48\\u0e22"
4719 "\\u0e27\\u0e02\\u0e49\\u0e2d\\u0e07\\u0e01\\u0e31\\u0e1a\\u0e40\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e02\\u0e2d"
4720 "\\u0e07\\u0e15\\u0e31\\u0e27\\u0e40\\u0e25\\u0e02. \\u0e04\\u0e2d\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d"
4721 "\\u0e23\\u0e4c\\u0e08\\u0e31\\u0e14\\u0e40\\u0e01\\u0e47\\u0e1a\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29"
4722 "\\u0e23\\u0e41\\u0e25\\u0e30\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30\\u0e2d\\u0e37\\u0e48\\u0e19\\u0e46 \\u0e42"
4723 "\\u0e14\\u0e22\\u0e01\\u0e32\\u0e23\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25"
4724 "\\u0e02\\u0e43\\u0e2b\\u0e49\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e41\\u0e15\\u0e48\\u0e25\\u0e30\\u0e15"
4725 "\\u0e31\\u0e27. \\u0e01\\u0e48\\u0e2d\\u0e19\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48\\u0e4a Unicode \\u0e08"
4726 "\\u0e30\\u0e16\\u0e39\\u0e01\\u0e2a\\u0e23\\u0e49\\u0e32\\u0e07\\u0e02\\u0e36\\u0e49\\u0e19, \\u0e44\\u0e14\\u0e49"
4727 "\\u0e21\\u0e35\\u0e23\\u0e30\\u0e1a\\u0e1a encoding \\u0e2d\\u0e22\\u0e39\\u0e48\\u0e2b\\u0e25\\u0e32\\u0e22\\u0e23"
4728 "\\u0e49\\u0e2d\\u0e22\\u0e23\\u0e30\\u0e1a\\u0e1a\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e01\\u0e32\\u0e23"
4729 "\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25\\u0e02\\u0e40\\u0e2b\\u0e25\\u0e48"
4730 "\\u0e32\\u0e19\\u0e35\\u0e49. \\u0e44\\u0e21\\u0e48\\u0e21\\u0e35 encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48"
4731 "\\u0e21\\u0e35\\u0e08\\u0e33\\u0e19\\u0e27\\u0e19\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30"
4732 "\\u0e21\\u0e32\\u0e01\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d: \\u0e22\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d"
4733 "\\u0e22\\u0e48\\u0e32\\u0e07\\u0e40\\u0e0a\\u0e48\\u0e19, \\u0e40\\u0e09\\u0e1e\\u0e32\\u0e30\\u0e43\\u0e19\\u0e01"
4734 "\\u0e25\\u0e38\\u0e48\\u0e21\\u0e2a\\u0e2b\\u0e20\\u0e32\\u0e1e\\u0e22\\u0e38\\u0e42\\u0e23\\u0e1b\\u0e40\\u0e1e"
4735 "\\u0e35\\u0e22\\u0e07\\u0e41\\u0e2b\\u0e48\\u0e07\\u0e40\\u0e14\\u0e35\\u0e22\\u0e27 \\u0e01\\u0e47\\u0e15\\u0e49"
4736 "\\u0e2d\\u0e07\\u0e01\\u0e32\\u0e23\\u0e2b\\u0e25\\u0e32\\u0e22 encoding \\u0e43\\u0e19\\u0e01\\u0e32\\u0e23\\u0e04"
4737 "\\u0e23\\u0e2d\\u0e1a\\u0e04\\u0e25\\u0e38\\u0e21\\u0e17\\u0e38\\u0e01\\u0e20\\u0e32\\u0e29\\u0e32\\u0e43\\u0e19"
4738 "\\u0e01\\u0e25\\u0e38\\u0e48\\u0e21. \\u0e2b\\u0e23\\u0e37\\u0e2d\\u0e41\\u0e21\\u0e49\\u0e41\\u0e15\\u0e48\\u0e43"
4739 "\\u0e19\\u0e20\\u0e32\\u0e29\\u0e32\\u0e40\\u0e14\\u0e35\\u0e48\\u0e22\\u0e27 \\u0e40\\u0e0a\\u0e48\\u0e19 \\u0e20"
4740 "\\u0e32\\u0e29\\u0e32\\u0e2d\\u0e31\\u0e07\\u0e01\\u0e24\\u0e29 \\u0e01\\u0e47\\u0e44\\u0e21\\u0e48\\u0e21\\u0e35"
4741 " encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d\\u0e2a\\u0e33\\u0e2b"
4742 "\\u0e23\\u0e31\\u0e1a\\u0e17\\u0e38\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29\\u0e23, \\u0e40\\u0e04"
4743 "\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e27\\u0e23\\u0e23\\u0e04\\u0e15\\u0e2d\\u0e19"
4744 " \\u0e41\\u0e25\\u0e30\\u0e2a\\u0e31\\u0e0d\\u0e25\\u0e31\\u0e01\\u0e29\\u0e13\\u0e4c\\u0e17\\u0e32\\u0e07\\u0e40"
4745 "\\u0e17\\u0e04\\u0e19\\u0e34\\u0e04\\u0e17\\u0e35\\u0e48\\u0e43\\u0e0a\\u0e49\\u0e01\\u0e31\\u0e19\\u0e2d\\u0e22"
4746 "\\u0e39\\u0e48\\u0e17\\u0e31\\u0e48\\u0e27\\u0e44\\u0e1b.";
4747
4748 const char *latinText =
4749 "doy ph\\u1ee5\\u0304\\u0302n \\u1e6d\\u0304h\\u0101n l\\u00e6\\u0302w, khxmphiwtexr\\u0312 ca ke\\u012b\\u0300"
4750 "ywk\\u0304\\u0125xng k\\u1ea1b re\\u1ee5\\u0304\\u0300xng k\\u0304hxng t\\u1ea1wlek\\u0304h. khxmphiwtexr"
4751 "\\u0312 c\\u1ea1d k\\u0115b t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r l\\u00e6a x\\u1ea1kk\\u0304h ra x\\u1ee5\\u0304"
4752 "\\u0300n\\u00ab doy k\\u0101r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304\\u0131\\u0302 s\\u0304"
4753 "\\u1ea3h\\u0304r\\u1ea1b t\\u00e6\\u0300la t\\u1ea1w. k\\u0300xn h\\u0304n\\u0302\\u0101 th\\u012b\\u0300\\u0301"
4754 " Unicode ca t\\u0304h\\u016bk s\\u0304r\\u0302\\u0101ng k\\u0304h\\u1ee5\\u0302n, d\\u1ecb\\u0302 m\\u012b "
4755 "rabb encoding xy\\u016b\\u0300 h\\u0304l\\u0101y r\\u0302xy rabb s\\u0304\\u1ea3h\\u0304r\\u1ea1b k\\u0101"
4756 "r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304el\\u0300\\u0101 n\\u012b\\u0302. m\\u1ecb\\u0300m"
4757 "\\u012b encoding d\\u0131 th\\u012b\\u0300 m\\u012b c\\u1ea3nwn t\\u1ea1w x\\u1ea1kk\\u0304hra m\\u0101k p"
4758 "he\\u012byng phx: yk t\\u1ea1wx\\u1ef3\\u0101ng ch\\u00e8n, c\\u0304heph\\u0101a n\\u0131 kl\\u00f9m s\\u0304"
4759 "h\\u0304p\\u0323h\\u0101ph yurop phe\\u012byng h\\u0304\\u00e6\\u0300ng de\\u012byw k\\u0306 t\\u0302xngk\\u0101"
4760 "r h\\u0304l\\u0101y encoding n\\u0131 k\\u0101r khrxbkhlum thuk p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 n\\u0131"
4761 " kl\\u00f9m. h\\u0304r\\u1ee5\\u0304x m\\u00e6\\u0302t\\u00e6\\u0300 n\\u0131 p\\u0323h\\u0101s\\u0304\\u02b9"
4762 "\\u0101 de\\u012b\\u0300yw ch\\u00e8n p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 x\\u1ea1ngkvs\\u0304\\u02b9 k\\u0306"
4763 " m\\u1ecb\\u0300m\\u012b encoding d\\u0131 th\\u012b\\u0300 phe\\u012byng phx s\\u0304\\u1ea3h\\u0304r\\u1ea1"
4764 "b thuk t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r, kher\\u1ee5\\u0304\\u0300xngh\\u0304m\\u0101y wrrkh txn l\\u00e6"
4765 "a s\\u0304\\u1ea1\\u1ef5l\\u1ea1ks\\u0304\\u02b9\\u1e47\\u0312 th\\u0101ng thekhnikh th\\u012b\\u0300 ch\\u0131"
4766 "\\u0302 k\\u1ea1n xy\\u016b\\u0300 th\\u1ea1\\u0300wp\\u1ecb.";
4767
4768
4769 UnicodeString xlitText(thaiText);
4770 xlitText = xlitText.unescape();
4771 tr->transliterate(xlitText);
4772
4773 UnicodeString expectedText(latinText);
4774 expectedText = expectedText.unescape();
4775 expect(*tr, xlitText, expectedText);
4776
4777 delete tr;
4778 #endif
4779 }
4780
4781
4782 //======================================================================
4783 // Support methods
4784 //======================================================================
expectT(const UnicodeString & id,const UnicodeString & source,const UnicodeString & expectedResult)4785 void TransliteratorTest::expectT(const UnicodeString& id,
4786 const UnicodeString& source,
4787 const UnicodeString& expectedResult) {
4788 UErrorCode ec = U_ZERO_ERROR;
4789 UParseError pe;
4790 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
4791 if (U_FAILURE(ec)) {
4792 errln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(ec));
4793 delete t;
4794 return;
4795 }
4796 expect(*t, source, expectedResult);
4797 delete t;
4798 }
4799
reportParseError(const UnicodeString & message,const UParseError & parseError,const UErrorCode & status)4800 void TransliteratorTest::reportParseError(const UnicodeString& message,
4801 const UParseError& parseError,
4802 const UErrorCode& status) {
4803 dataerrln(message +
4804 /*", parse error " + parseError.code +*/
4805 ", line " + parseError.line +
4806 ", offset " + parseError.offset +
4807 ", pre-context " + prettify(parseError.preContext, true) +
4808 ", post-context " + prettify(parseError.postContext,true) +
4809 ", Error: " + u_errorName(status));
4810 }
4811
expect(const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4812 void TransliteratorTest::expect(const UnicodeString& rules,
4813 const UnicodeString& source,
4814 const UnicodeString& expectedResult,
4815 UTransPosition *pos) {
4816 expect("<ID>", rules, source, expectedResult, pos);
4817 }
4818
expect(const UnicodeString & id,const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4819 void TransliteratorTest::expect(const UnicodeString& id,
4820 const UnicodeString& rules,
4821 const UnicodeString& source,
4822 const UnicodeString& expectedResult,
4823 UTransPosition *pos) {
4824 UErrorCode status = U_ZERO_ERROR;
4825 UParseError parseError;
4826 Transliterator* t = Transliterator::createFromRules(id, rules, UTRANS_FORWARD, parseError, status);
4827 if (U_FAILURE(status)) {
4828 reportParseError(UnicodeString("Couldn't create transliterator from ") + rules, parseError, status);
4829 } else {
4830 expect(*t, source, expectedResult, pos);
4831 }
4832 delete t;
4833 }
4834
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,const Transliterator & reverseTransliterator)4835 void TransliteratorTest::expect(const Transliterator& t,
4836 const UnicodeString& source,
4837 const UnicodeString& expectedResult,
4838 const Transliterator& reverseTransliterator) {
4839 expect(t, source, expectedResult);
4840 expect(reverseTransliterator, expectedResult, source);
4841 }
4842
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4843 void TransliteratorTest::expect(const Transliterator& t,
4844 const UnicodeString& source,
4845 const UnicodeString& expectedResult,
4846 UTransPosition *pos) {
4847 if (pos == 0) {
4848 UnicodeString result(source);
4849 t.transliterate(result);
4850 expectAux(t.getID() + ":String", source, result, expectedResult);
4851 }
4852 UTransPosition index={0, 0, 0, 0};
4853 if (pos != 0) {
4854 index = *pos;
4855 }
4856
4857 UnicodeString rsource(source);
4858 if (pos == 0) {
4859 t.transliterate(rsource);
4860 } else {
4861 // Do it all at once -- below we do it incrementally
4862 t.finishTransliteration(rsource, *pos);
4863 }
4864 expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
4865
4866 // Test keyboard (incremental) transliteration -- this result
4867 // must be the same after we finalize (see below).
4868 UnicodeString log;
4869 rsource.remove();
4870 if (pos != 0) {
4871 rsource = source;
4872 formatInput(log, rsource, index);
4873 log.append(" -> ");
4874 UErrorCode status = U_ZERO_ERROR;
4875 t.transliterate(rsource, index, status);
4876 formatInput(log, rsource, index);
4877 } else {
4878 for (int32_t i=0; i<source.length(); ++i) {
4879 if (i != 0) {
4880 log.append(" + ");
4881 }
4882 log.append(source.charAt(i)).append(" -> ");
4883 UErrorCode status = U_ZERO_ERROR;
4884 t.transliterate(rsource, index, source.charAt(i), status);
4885 formatInput(log, rsource, index);
4886 }
4887 }
4888
4889 // As a final step in keyboard transliteration, we must call
4890 // transliterate to finish off any pending partial matches that
4891 // were waiting for more input.
4892 t.finishTransliteration(rsource, index);
4893 log.append(" => ").append(rsource);
4894
4895 expectAux(t.getID() + ":Keyboard", log,
4896 rsource == expectedResult,
4897 expectedResult);
4898 }
4899
4900
4901 /**
4902 * @param appendTo result is appended to this param.
4903 * @param input the string being transliterated
4904 * @param pos the index struct
4905 */
formatInput(UnicodeString & appendTo,const UnicodeString & input,const UTransPosition & pos)4906 UnicodeString& TransliteratorTest::formatInput(UnicodeString &appendTo,
4907 const UnicodeString& input,
4908 const UTransPosition& pos) {
4909 // Output a string of the form aaa{bbb|ccc|ddd}eee, where
4910 // the {} indicate the context start and limit, and the ||
4911 // indicate the start and limit.
4912 if (0 <= pos.contextStart &&
4913 pos.contextStart <= pos.start &&
4914 pos.start <= pos.limit &&
4915 pos.limit <= pos.contextLimit &&
4916 pos.contextLimit <= input.length()) {
4917
4918 UnicodeString a, b, c, d, e;
4919 input.extractBetween(0, pos.contextStart, a);
4920 input.extractBetween(pos.contextStart, pos.start, b);
4921 input.extractBetween(pos.start, pos.limit, c);
4922 input.extractBetween(pos.limit, pos.contextLimit, d);
4923 input.extractBetween(pos.contextLimit, input.length(), e);
4924 appendTo.append(a).append((UChar)123/*{*/).append(b).
4925 append((UChar)PIPE).append(c).append((UChar)PIPE).append(d).
4926 append((UChar)125/*}*/).append(e);
4927 } else {
4928 appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
4929 pos.contextStart + ", s=" + pos.start + ", l=" +
4930 pos.limit + ", cl=" + pos.contextLimit + "} on " +
4931 input);
4932 }
4933 return appendTo;
4934 }
4935
expectAux(const UnicodeString & tag,const UnicodeString & source,const UnicodeString & result,const UnicodeString & expectedResult)4936 void TransliteratorTest::expectAux(const UnicodeString& tag,
4937 const UnicodeString& source,
4938 const UnicodeString& result,
4939 const UnicodeString& expectedResult) {
4940 expectAux(tag, source + " -> " + result,
4941 result == expectedResult,
4942 expectedResult);
4943 }
4944
expectAux(const UnicodeString & tag,const UnicodeString & summary,UBool pass,const UnicodeString & expectedResult)4945 void TransliteratorTest::expectAux(const UnicodeString& tag,
4946 const UnicodeString& summary, UBool pass,
4947 const UnicodeString& expectedResult) {
4948 if (pass) {
4949 logln(UnicodeString("(")+tag+") " + prettify(summary));
4950 } else {
4951 dataerrln(UnicodeString("FAIL: (")+tag+") "
4952 + prettify(summary)
4953 + ", expected " + prettify(expectedResult));
4954 }
4955 }
4956
4957 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
4958