1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1999-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 11/10/99 aliu Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "transtst.h"
18 #include "unicode/locid.h"
19 #include "unicode/dtfmtsym.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/translit.h"
22 #include "unicode/uchar.h"
23 #include "unicode/unifilt.h"
24 #include "unicode/uniset.h"
25 #include "unicode/ustring.h"
26 #include "unicode/usetiter.h"
27 #include "unicode/uscript.h"
28 #include "unicode/utf16.h"
29 #include "cpdtrans.h"
30 #include "nultrans.h"
31 #include "rbt.h"
32 #include "rbt_pars.h"
33 #include "anytrans.h"
34 #include "esctrn.h"
35 #include "name2uni.h"
36 #include "nortrans.h"
37 #include "remtrans.h"
38 #include "titletrn.h"
39 #include "tolowtrn.h"
40 #include "toupptrn.h"
41 #include "unesctrn.h"
42 #include "uni2name.h"
43 #include "cstring.h"
44 #include "cmemory.h"
45 #include <stdio.h>
46
47 /***********************************************************************
48
49 HOW TO USE THIS TEST FILE
50 -or-
51 How I developed on two platforms
52 without losing (too much of) my mind
53
54
55 1. Add new tests by copying/pasting/changing existing tests. On Java,
56 any public void method named Test...() taking no parameters becomes
57 a test. On C++, you need to modify the header and add a line to
58 the runIndexedTest() dispatch method.
59
60 2. Make liberal use of the expect() method; it is your friend.
61
62 3. The tests in this file exactly match those in a sister file on the
63 other side. The two files are:
64
65 icu4j: src/com/ibm/test/translit/TransliteratorTest.java
66 icu4c: source/test/intltest/transtst.cpp
67
68 ==> THIS IS THE IMPORTANT PART <==
69
70 When you add a test in this file, add it in TransliteratorTest.java
71 too. Give it the same name and put it in the same relative place.
72 This makes maintenance a lot simpler for any poor soul who ends up
73 trying to synchronize the tests between icu4j and icu4c.
74
75 4. If you MUST enter a test that is NOT paralleled in the sister file,
76 then add it in the special non-mirrored section. These are
77 labeled
78
79 "icu4j ONLY"
80
81 or
82
83 "icu4c ONLY"
84
85 Make sure you document the reason the test is here and not there.
86
87
88 Thank you.
89 The Management
90 ***********************************************************************/
91
92 // Define character constants thusly to be EBCDIC-friendly
93 enum {
94 LEFT_BRACE=((UChar)0x007B), /*{*/
95 PIPE =((UChar)0x007C), /*|*/
96 ZERO =((UChar)0x0030), /*0*/
97 UPPER_A =((UChar)0x0041) /*A*/
98 };
99
TransliteratorTest()100 TransliteratorTest::TransliteratorTest()
101 : DESERET_DEE((UChar32)0x10414),
102 DESERET_dee((UChar32)0x1043C)
103 {
104 }
105
~TransliteratorTest()106 TransliteratorTest::~TransliteratorTest() {}
107
108 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)109 TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
110 const char* &name, char* /*par*/) {
111 switch (index) {
112 TESTCASE(0,TestInstantiation);
113 TESTCASE(1,TestSimpleRules);
114 TESTCASE(2,TestRuleBasedInverse);
115 TESTCASE(3,TestKeyboard);
116 TESTCASE(4,TestKeyboard2);
117 TESTCASE(5,TestKeyboard3);
118 TESTCASE(6,TestArabic);
119 TESTCASE(7,TestCompoundKana);
120 TESTCASE(8,TestCompoundHex);
121 TESTCASE(9,TestFiltering);
122 TESTCASE(10,TestInlineSet);
123 TESTCASE(11,TestPatternQuoting);
124 TESTCASE(12,TestJ277);
125 TESTCASE(13,TestJ243);
126 TESTCASE(14,TestJ329);
127 TESTCASE(15,TestSegments);
128 TESTCASE(16,TestCursorOffset);
129 TESTCASE(17,TestArbitraryVariableValues);
130 TESTCASE(18,TestPositionHandling);
131 TESTCASE(19,TestHiraganaKatakana);
132 TESTCASE(20,TestCopyJ476);
133 TESTCASE(21,TestAnchors);
134 TESTCASE(22,TestInterIndic);
135 TESTCASE(23,TestFilterIDs);
136 TESTCASE(24,TestCaseMap);
137 TESTCASE(25,TestNameMap);
138 TESTCASE(26,TestLiberalizedID);
139 TESTCASE(27,TestCreateInstance);
140 TESTCASE(28,TestNormalizationTransliterator);
141 TESTCASE(29,TestCompoundRBT);
142 TESTCASE(30,TestCompoundFilter);
143 TESTCASE(31,TestRemove);
144 TESTCASE(32,TestToRules);
145 TESTCASE(33,TestContext);
146 TESTCASE(34,TestSupplemental);
147 TESTCASE(35,TestQuantifier);
148 TESTCASE(36,TestSTV);
149 TESTCASE(37,TestCompoundInverse);
150 TESTCASE(38,TestNFDChainRBT);
151 TESTCASE(39,TestNullInverse);
152 TESTCASE(40,TestAliasInverseID);
153 TESTCASE(41,TestCompoundInverseID);
154 TESTCASE(42,TestUndefinedVariable);
155 TESTCASE(43,TestEmptyContext);
156 TESTCASE(44,TestCompoundFilterID);
157 TESTCASE(45,TestPropertySet);
158 TESTCASE(46,TestNewEngine);
159 TESTCASE(47,TestQuantifiedSegment);
160 TESTCASE(48,TestDevanagariLatinRT);
161 TESTCASE(49,TestTeluguLatinRT);
162 TESTCASE(50,TestCompoundLatinRT);
163 TESTCASE(51,TestSanskritLatinRT);
164 TESTCASE(52,TestLocaleInstantiation);
165 TESTCASE(53,TestTitleAccents);
166 TESTCASE(54,TestLocaleResource);
167 TESTCASE(55,TestParseError);
168 TESTCASE(56,TestOutputSet);
169 TESTCASE(57,TestVariableRange);
170 TESTCASE(58,TestInvalidPostContext);
171 TESTCASE(59,TestIDForms);
172 TESTCASE(60,TestToRulesMark);
173 TESTCASE(61,TestEscape);
174 TESTCASE(62,TestAnchorMasking);
175 TESTCASE(63,TestDisplayName);
176 TESTCASE(64,TestSpecialCases);
177 #if !UCONFIG_NO_FILE_IO
178 TESTCASE(65,TestIncrementalProgress);
179 #endif
180 TESTCASE(66,TestSurrogateCasing);
181 TESTCASE(67,TestFunction);
182 TESTCASE(68,TestInvalidBackRef);
183 TESTCASE(69,TestMulticharStringSet);
184 TESTCASE(70,TestUserFunction);
185 TESTCASE(71,TestAnyX);
186 TESTCASE(72,TestSourceTargetSet);
187 TESTCASE(73,TestGurmukhiDevanagari);
188 TESTCASE(74,TestPatternWhiteSpace);
189 TESTCASE(75,TestAllCodepoints);
190 TESTCASE(76,TestBoilerplate);
191 TESTCASE(77,TestAlternateSyntax);
192 TESTCASE(78,TestBeginEnd);
193 TESTCASE(79,TestBeginEndToRules);
194 TESTCASE(80,TestRegisterAlias);
195 TESTCASE(81,TestRuleStripping);
196 TESTCASE(82,TestHalfwidthFullwidth);
197 TESTCASE(83,TestThai);
198 TESTCASE(84,TestAny);
199 TESTCASE(85,TestBasicTransliteratorEvenWithoutData);
200 default: name = ""; break;
201 }
202 }
203
204 /**
205 * Make sure every system transliterator can be instantiated.
206 *
207 * ALSO test that the result of toRules() for each rule is a valid
208 * rule. Do this here so we don't have to have another test that
209 * instantiates everything as well.
210 */
TestInstantiation()211 void TransliteratorTest::TestInstantiation() {
212 UErrorCode ec = U_ZERO_ERROR;
213 StringEnumeration* avail = Transliterator::getAvailableIDs(ec);
214 assertSuccess("getAvailableIDs()", ec);
215 assertTrue("getAvailableIDs()!=NULL", avail!=NULL);
216 int32_t n = Transliterator::countAvailableIDs();
217 assertTrue("getAvailableIDs().count()==countAvailableIDs()",
218 avail->count(ec) == n);
219 assertSuccess("count()", ec);
220 UnicodeString name;
221 for (int32_t i=0; i<n; ++i) {
222 const UnicodeString& id = *avail->snext(ec);
223 if (!assertSuccess("snext()", ec) ||
224 !assertTrue("snext()!=NULL", (&id)!=NULL, TRUE)) {
225 break;
226 }
227 UnicodeString id2 = Transliterator::getAvailableID(i);
228 if (id.length() < 1) {
229 errln(UnicodeString("FAIL: getAvailableID(") +
230 i + ") returned empty string");
231 continue;
232 }
233 if (id != id2) {
234 errln(UnicodeString("FAIL: getAvailableID(") +
235 i + ") != getAvailableIDs().snext()");
236 continue;
237 }
238 UParseError parseError;
239 UErrorCode status = U_ZERO_ERROR;
240 Transliterator* t = Transliterator::createInstance(id,
241 UTRANS_FORWARD, parseError,status);
242 name.truncate(0);
243 Transliterator::getDisplayName(id, name);
244 if (t == 0) {
245 #if UCONFIG_NO_BREAK_ITERATION
246 // If UCONFIG_NO_BREAK_ITERATION is on, then only Thai should fail.
247 if (id.compare((UnicodeString)"Thai-Latn") != 0 &&
248 id.compare((UnicodeString)"Thai-Latin") != 0)
249 #endif
250 dataerrln(UnicodeString("FAIL: Couldn't create ") + id +
251 /*", parse error " + parseError.code +*/
252 ", line " + parseError.line +
253 ", offset " + parseError.offset +
254 ", pre-context " + prettify(parseError.preContext, TRUE) +
255 ", post-context " +prettify(parseError.postContext,TRUE) +
256 ", Error: " + u_errorName(status));
257 // When createInstance fails, it deletes the failing
258 // entry from the available ID list. We detect this
259 // here by looking for a change in countAvailableIDs.
260 int32_t nn = Transliterator::countAvailableIDs();
261 if (nn == (n - 1)) {
262 n = nn;
263 --i; // Compensate for deleted entry
264 }
265 } else {
266 logln(UnicodeString("OK: ") + name + " (" + id + ")");
267
268 // Now test toRules
269 UnicodeString rules;
270 t->toRules(rules, TRUE);
271 Transliterator *u = Transliterator::createFromRules("x",
272 rules, UTRANS_FORWARD, parseError,status);
273 if (u == 0) {
274 errln(UnicodeString("FAIL: ") + id +
275 ".createFromRules() => bad rules" +
276 /*", parse error " + parseError.code +*/
277 ", line " + parseError.line +
278 ", offset " + parseError.offset +
279 ", context " + prettify(parseError.preContext, TRUE) +
280 ", rules: " + prettify(rules, TRUE));
281 } else {
282 delete u;
283 }
284 delete t;
285 }
286 }
287 assertTrue("snext()==NULL", avail->snext(ec)==NULL);
288 assertSuccess("snext()", ec);
289 delete avail;
290
291 // Now test the failure path
292 UParseError parseError;
293 UErrorCode status = U_ZERO_ERROR;
294 UnicodeString id("<Not a valid Transliterator ID>");
295 Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
296 if (t != 0) {
297 errln("FAIL: " + id + " returned a transliterator");
298 delete t;
299 } else {
300 logln("OK: Bogus ID handled properly");
301 }
302 }
303
TestSimpleRules(void)304 void TransliteratorTest::TestSimpleRules(void) {
305 /* Example: rules 1. ab>x|y
306 * 2. yc>z
307 *
308 * []|eabcd start - no match, copy e to tranlated buffer
309 * [e]|abcd match rule 1 - copy output & adjust cursor
310 * [ex|y]cd match rule 2 - copy output & adjust cursor
311 * [exz]|d no match, copy d to transliterated buffer
312 * [exzd]| done
313 */
314 expect(UnicodeString("ab>x|y;", "") +
315 "yc>z",
316 "eabcd", "exzd");
317
318 /* Another set of rules:
319 * 1. ab>x|yzacw
320 * 2. za>q
321 * 3. qc>r
322 * 4. cw>n
323 *
324 * []|ab Rule 1
325 * [x|yzacw] No match
326 * [xy|zacw] Rule 2
327 * [xyq|cw] Rule 4
328 * [xyqn]| Done
329 */
330 expect(UnicodeString("ab>x|yzacw;") +
331 "za>q;" +
332 "qc>r;" +
333 "cw>n",
334 "ab", "xyqn");
335
336 /* Test categories
337 */
338 UErrorCode status = U_ZERO_ERROR;
339 UParseError parseError;
340 Transliterator *t = Transliterator::createFromRules(
341 "<ID>",
342 UnicodeString("$dummy=").append((UChar)0xE100) +
343 UnicodeString(";"
344 "$vowel=[aeiouAEIOU];"
345 "$lu=[:Lu:];"
346 "$vowel } $lu > '!';"
347 "$vowel > '&';"
348 "'!' { $lu > '^';"
349 "$lu > '*';"
350 "a > ERROR", ""),
351 UTRANS_FORWARD, parseError,
352 status);
353 if (U_FAILURE(status)) {
354 dataerrln("FAIL: RBT constructor failed - %s", u_errorName(status));
355 return;
356 }
357 expect(*t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
358 delete t;
359 }
360
361 /**
362 * Test inline set syntax and set variable syntax.
363 */
TestInlineSet(void)364 void TransliteratorTest::TestInlineSet(void) {
365 expect("{ [:Ll:] } x > y; [:Ll:] > z;", "aAbxq", "zAyzz");
366 expect("a[0-9]b > qrs", "1a7b9", "1qrs9");
367
368 expect(UnicodeString(
369 "$digit = [0-9];"
370 "$alpha = [a-zA-Z];"
371 "$alphanumeric = [$digit $alpha];" // ***
372 "$special = [^$alphanumeric];" // ***
373 "$alphanumeric > '-';"
374 "$special > '*';", ""),
375
376 "thx-1138", "---*----");
377 }
378
379 /**
380 * Create some inverses and confirm that they work. We have to be
381 * careful how we do this, since the inverses will not be true
382 * inverses -- we can't throw any random string at the composition
383 * of the transliterators and expect the identity function. F x
384 * F' != I. However, if we are careful about the input, we will
385 * get the expected results.
386 */
TestRuleBasedInverse(void)387 void TransliteratorTest::TestRuleBasedInverse(void) {
388 UnicodeString RULES =
389 UnicodeString("abc>zyx;") +
390 "ab>yz;" +
391 "bc>zx;" +
392 "ca>xy;" +
393 "a>x;" +
394 "b>y;" +
395 "c>z;" +
396
397 "abc<zyx;" +
398 "ab<yz;" +
399 "bc<zx;" +
400 "ca<xy;" +
401 "a<x;" +
402 "b<y;" +
403 "c<z;" +
404
405 "";
406
407 const char* DATA[] = {
408 // Careful here -- random strings will not work. If we keep
409 // the left side to the domain and the right side to the range
410 // we will be okay though (left, abc; right xyz).
411 "a", "x",
412 "abcacab", "zyxxxyy",
413 "caccb", "xyzzy",
414 };
415
416 int32_t DATA_length = UPRV_LENGTHOF(DATA);
417
418 UErrorCode status = U_ZERO_ERROR;
419 UParseError parseError;
420 Transliterator *fwd = Transliterator::createFromRules("<ID>", RULES,
421 UTRANS_FORWARD, parseError, status);
422 Transliterator *rev = Transliterator::createFromRules("<ID>", RULES,
423 UTRANS_REVERSE, parseError, status);
424 if (U_FAILURE(status)) {
425 errln("FAIL: RBT constructor failed");
426 return;
427 }
428 for (int32_t i=0; i<DATA_length; i+=2) {
429 expect(*fwd, DATA[i], DATA[i+1]);
430 expect(*rev, DATA[i+1], DATA[i]);
431 }
432 delete fwd;
433 delete rev;
434 }
435
436 /**
437 * Basic test of keyboard.
438 */
TestKeyboard(void)439 void TransliteratorTest::TestKeyboard(void) {
440 UParseError parseError;
441 UErrorCode status = U_ZERO_ERROR;
442 Transliterator *t = Transliterator::createFromRules("<ID>",
443 UnicodeString("psch>Y;")
444 +"ps>y;"
445 +"ch>x;"
446 +"a>A;",
447 UTRANS_FORWARD, parseError,
448 status);
449 if (U_FAILURE(status)) {
450 errln("FAIL: RBT constructor failed");
451 return;
452 }
453 const char* DATA[] = {
454 // insertion, buffer
455 "a", "A",
456 "p", "Ap",
457 "s", "Aps",
458 "c", "Apsc",
459 "a", "AycA",
460 "psch", "AycAY",
461 0, "AycAY", // null means finishKeyboardTransliteration
462 };
463
464 keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
465 delete t;
466 }
467
468 /**
469 * Basic test of keyboard with cursor.
470 */
TestKeyboard2(void)471 void TransliteratorTest::TestKeyboard2(void) {
472 UParseError parseError;
473 UErrorCode status = U_ZERO_ERROR;
474 Transliterator *t = Transliterator::createFromRules("<ID>",
475 UnicodeString("ych>Y;")
476 +"ps>|y;"
477 +"ch>x;"
478 +"a>A;",
479 UTRANS_FORWARD, parseError,
480 status);
481 if (U_FAILURE(status)) {
482 errln("FAIL: RBT constructor failed");
483 return;
484 }
485 const char* DATA[] = {
486 // insertion, buffer
487 "a", "A",
488 "p", "Ap",
489 "s", "Aps", // modified for rollback - "Ay",
490 "c", "Apsc", // modified for rollback - "Ayc",
491 "a", "AycA",
492 "p", "AycAp",
493 "s", "AycAps", // modified for rollback - "AycAy",
494 "c", "AycApsc", // modified for rollback - "AycAyc",
495 "h", "AycAY",
496 0, "AycAY", // null means finishKeyboardTransliteration
497 };
498
499 keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
500 delete t;
501 }
502
503 /**
504 * Test keyboard transliteration with back-replacement.
505 */
TestKeyboard3(void)506 void TransliteratorTest::TestKeyboard3(void) {
507 // We want th>z but t>y. Furthermore, during keyboard
508 // transliteration we want t>y then yh>z if t, then h are
509 // typed.
510 UnicodeString RULES("t>|y;"
511 "yh>z;");
512
513 const char* DATA[] = {
514 // Column 1: characters to add to buffer (as if typed)
515 // Column 2: expected appearance of buffer after
516 // keyboard xliteration.
517 "a", "a",
518 "b", "ab",
519 "t", "abt", // modified for rollback - "aby",
520 "c", "abyc",
521 "t", "abyct", // modified for rollback - "abycy",
522 "h", "abycz",
523 0, "abycz", // null means finishKeyboardTransliteration
524 };
525
526 UParseError parseError;
527 UErrorCode status = U_ZERO_ERROR;
528 Transliterator *t = Transliterator::createFromRules("<ID>", RULES, UTRANS_FORWARD, parseError, status);
529 if (U_FAILURE(status)) {
530 errln("FAIL: RBT constructor failed");
531 return;
532 }
533 keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
534 delete t;
535 }
536
keyboardAux(const Transliterator & t,const char * DATA[],int32_t DATA_length)537 void TransliteratorTest::keyboardAux(const Transliterator& t,
538 const char* DATA[], int32_t DATA_length) {
539 UErrorCode status = U_ZERO_ERROR;
540 UTransPosition index={0, 0, 0, 0};
541 UnicodeString s;
542 for (int32_t i=0; i<DATA_length; i+=2) {
543 UnicodeString log;
544 if (DATA[i] != 0) {
545 log = s + " + "
546 + DATA[i]
547 + " -> ";
548 t.transliterate(s, index, DATA[i], status);
549 } else {
550 log = s + " => ";
551 t.finishTransliteration(s, index);
552 }
553 // Show the start index '{' and the cursor '|'
554 UnicodeString a, b, c;
555 s.extractBetween(0, index.contextStart, a);
556 s.extractBetween(index.contextStart, index.start, b);
557 s.extractBetween(index.start, s.length(), c);
558 log.append(a).
559 append((UChar)LEFT_BRACE).
560 append(b).
561 append((UChar)PIPE).
562 append(c);
563 if (s == DATA[i+1] && U_SUCCESS(status)) {
564 logln(log);
565 } else {
566 errln(UnicodeString("FAIL: ") + log + ", expected " + DATA[i+1]);
567 }
568 }
569 }
570
TestArabic(void)571 void TransliteratorTest::TestArabic(void) {
572 // Test disabled for 2.0 until new Arabic transliterator can be written.
573 // /*
574 // const char* DATA[] = {
575 // "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
576 // "\u0627\u0644\u0644\u063a\u0629\u0020"+
577 // "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
578 // "\u0628\u0628\u0646\u0638\u0645\u0020"+
579 // "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
580 // "\u062c\u0645\u064a\u0644\u0629",
581 // };
582 // */
583 //
584 // UChar ar_raw[] = {
585 // 0x062a, 0x062a, 0x0645, 0x062a, 0x0639, 0x0020, 0x0627,
586 // 0x0644, 0x0644, 0x063a, 0x0629, 0x0020, 0x0627, 0x0644,
587 // 0x0639, 0x0631, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
588 // 0x0628, 0x0628, 0x0646, 0x0638, 0x0645, 0x0020, 0x0643,
589 // 0x062a, 0x0627, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
590 // 0x062c, 0x0645, 0x064a, 0x0644, 0x0629, 0
591 // };
592 // UnicodeString ar(ar_raw);
593 // UErrorCode status=U_ZERO_ERROR;
594 // UParseError parseError;
595 // Transliterator *t = Transliterator::createInstance("Latin-Arabic", UTRANS_FORWARD, parseError, status);
596 // if (t == 0) {
597 // errln("FAIL: createInstance failed");
598 // return;
599 // }
600 // expect(*t, "Arabic", ar);
601 // delete t;
602 }
603
604 /**
605 * Compose the Kana transliterator forward and reverse and try
606 * some strings that should come out unchanged.
607 */
TestCompoundKana(void)608 void TransliteratorTest::TestCompoundKana(void) {
609 UParseError parseError;
610 UErrorCode status = U_ZERO_ERROR;
611 Transliterator* t = Transliterator::createInstance("Latin-Hiragana;Hiragana-Latin", UTRANS_FORWARD, parseError, status);
612 if (t == 0) {
613 dataerrln("FAIL: construction of Latin-Hiragana;Hiragana-Latin failed - %s", u_errorName(status));
614 } else {
615 expect(*t, "aaaaa", "aaaaa");
616 delete t;
617 }
618 }
619
620 /**
621 * Compose the hex transliterators forward and reverse.
622 */
TestCompoundHex(void)623 void TransliteratorTest::TestCompoundHex(void) {
624 UParseError parseError;
625 UErrorCode status = U_ZERO_ERROR;
626 Transliterator* a = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
627 Transliterator* b = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, parseError, status);
628 Transliterator* transab[] = { a, b };
629 Transliterator* transba[] = { b, a };
630 if (a == 0 || b == 0) {
631 errln("FAIL: construction failed");
632 delete a;
633 delete b;
634 return;
635 }
636 // Do some basic tests of a
637 expect(*a, "01", UnicodeString("\\u0030\\u0031", ""));
638 // Do some basic tests of b
639 expect(*b, UnicodeString("\\u0030\\u0031", ""), "01");
640
641 Transliterator* ab = new CompoundTransliterator(transab, 2);
642 UnicodeString s("abcde", "");
643 expect(*ab, s, s);
644
645 UnicodeString str(s);
646 a->transliterate(str);
647 Transliterator* ba = new CompoundTransliterator(transba, 2);
648 expect(*ba, str, str);
649
650 delete ab;
651 delete ba;
652 delete a;
653 delete b;
654 }
655
656 int gTestFilterClassID = 0;
657 /**
658 * Used by TestFiltering().
659 */
660 class TestFilter : public UnicodeFilter {
clone() const661 virtual TestFilter* clone() const {
662 return new TestFilter(*this);
663 }
contains(UChar32 c) const664 virtual UBool contains(UChar32 c) const {
665 return c != (UChar)0x0063 /*c*/;
666 }
667 // Stubs
toPattern(UnicodeString & result,UBool) const668 virtual UnicodeString& toPattern(UnicodeString& result,
669 UBool /*escapeUnprintable*/) const {
670 return result;
671 }
matchesIndexValue(uint8_t) const672 virtual UBool matchesIndexValue(uint8_t /*v*/) const {
673 return FALSE;
674 }
addMatchSetTo(UnicodeSet &) const675 virtual void addMatchSetTo(UnicodeSet& /*toUnionTo*/) const {}
676 public:
getDynamicClassID() const677 UClassID getDynamicClassID() const { return (UClassID)&gTestFilterClassID; }
678 };
679
680 /**
681 * Do some basic tests of filtering.
682 */
TestFiltering(void)683 void TransliteratorTest::TestFiltering(void) {
684 UParseError parseError;
685 UErrorCode status = U_ZERO_ERROR;
686 Transliterator* hex = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
687 if (hex == 0) {
688 errln("FAIL: createInstance(Any-Hex) failed");
689 return;
690 }
691 hex->adoptFilter(new TestFilter());
692 UnicodeString s("abcde");
693 hex->transliterate(s);
694 UnicodeString exp("\\u0061\\u0062c\\u0064\\u0065", "");
695 if (s == exp) {
696 logln(UnicodeString("Ok: \"") + exp + "\"");
697 } else {
698 logln(UnicodeString("FAIL: \"") + s + "\", wanted \"" + exp + "\"");
699 }
700
701 // ICU4C ONLY. Do not find Transliterator.orphanFilter() in ICU4J.
702 UnicodeFilter *f = hex->orphanFilter();
703 if (f == NULL){
704 errln("FAIL: orphanFilter() should get a UnicodeFilter");
705 } else {
706 delete f;
707 }
708 delete hex;
709 }
710
711 /**
712 * Test anchors
713 */
TestAnchors(void)714 void TransliteratorTest::TestAnchors(void) {
715 expect(UnicodeString("^a > 0; a$ > 2 ; a > 1;", ""),
716 "aaa",
717 "012");
718 expect(UnicodeString("$s=[z$]; $s{a>0; a}$s>2; a>1;", ""),
719 "aaa",
720 "012");
721 expect(UnicodeString("^ab > 01 ;"
722 " ab > |8 ;"
723 " b > k ;"
724 " 8x$ > 45 ;"
725 " 8x > 77 ;", ""),
726
727 "ababbabxabx",
728 "018k7745");
729 expect(UnicodeString("$s = [z$] ;"
730 "$s{ab > 01 ;"
731 " ab > |8 ;"
732 " b > k ;"
733 " 8x}$s > 45 ;"
734 " 8x > 77 ;", ""),
735
736 "abzababbabxzabxabx",
737 "01z018k45z01x45");
738 }
739
740 /**
741 * Test pattern quoting and escape mechanisms.
742 */
TestPatternQuoting(void)743 void TransliteratorTest::TestPatternQuoting(void) {
744 // Array of 3n items
745 // Each item is <rules>, <input>, <expected output>
746 const UnicodeString DATA[] = {
747 UnicodeString(UChar(0x4E01)) + ">'[male adult]'",
748 UnicodeString(UChar(0x4E01)),
749 "[male adult]"
750 };
751
752 for (int32_t i=0; i<3; i+=3) {
753 logln(UnicodeString("Pattern: ") + prettify(DATA[i]));
754 UParseError parseError;
755 UErrorCode status = U_ZERO_ERROR;
756 Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
757 if (U_FAILURE(status)) {
758 errln("RBT constructor failed");
759 } else {
760 expect(*t, DATA[i+1], DATA[i+2]);
761 }
762 delete t;
763 }
764 }
765
766 /**
767 * Regression test for bugs found in Greek transliteration.
768 */
TestJ277(void)769 void TransliteratorTest::TestJ277(void) {
770 UErrorCode status = U_ZERO_ERROR;
771 UParseError parseError;
772 Transliterator *gl = Transliterator::createInstance("Greek-Latin; NFD; [:M:]Remove; NFC", UTRANS_FORWARD, parseError, status);
773 if (gl == NULL) {
774 dataerrln("FAIL: createInstance(Greek-Latin) returned NULL - %s", u_errorName(status));
775 return;
776 }
777
778 UChar sigma = 0x3C3;
779 UChar upsilon = 0x3C5;
780 UChar nu = 0x3BD;
781 // UChar PHI = 0x3A6;
782 UChar alpha = 0x3B1;
783 // UChar omega = 0x3C9;
784 // UChar omicron = 0x3BF;
785 // UChar epsilon = 0x3B5;
786
787 // sigma upsilon nu -> syn
788 UnicodeString syn;
789 syn.append(sigma).append(upsilon).append(nu);
790 expect(*gl, syn, "syn");
791
792 // sigma alpha upsilon nu -> saun
793 UnicodeString sayn;
794 sayn.append(sigma).append(alpha).append(upsilon).append(nu);
795 expect(*gl, sayn, "saun");
796
797 // Again, using a smaller rule set
798 UnicodeString rules(
799 "$alpha = \\u03B1;"
800 "$nu = \\u03BD;"
801 "$sigma = \\u03C3;"
802 "$ypsilon = \\u03C5;"
803 "$vowel = [aeiouAEIOU$alpha$ypsilon];"
804 "s <> $sigma;"
805 "a <> $alpha;"
806 "u <> $vowel { $ypsilon;"
807 "y <> $ypsilon;"
808 "n <> $nu;",
809 "");
810 Transliterator *mini = Transliterator::createFromRules("mini", rules, UTRANS_REVERSE, parseError, status);
811 if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
812 expect(*mini, syn, "syn");
813 expect(*mini, sayn, "saun");
814 delete mini;
815 mini = NULL;
816
817 #if !UCONFIG_NO_FORMATTING
818 // Transliterate the Greek locale data
819 Locale el("el");
820 DateFormatSymbols syms(el, status);
821 if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
822 int32_t i, count;
823 const UnicodeString* data = syms.getMonths(count);
824 for (i=0; i<count; ++i) {
825 if (data[i].length() == 0) {
826 continue;
827 }
828 UnicodeString out(data[i]);
829 gl->transliterate(out);
830 UBool ok = TRUE;
831 if (data[i].length() >= 2 && out.length() >= 2 &&
832 u_isupper(data[i].charAt(0)) && u_islower(data[i].charAt(1))) {
833 if (!(u_isupper(out.charAt(0)) && u_islower(out.charAt(1)))) {
834 ok = FALSE;
835 }
836 }
837 if (ok) {
838 logln(prettify(data[i] + " -> " + out));
839 } else {
840 errln(UnicodeString("FAIL: ") + prettify(data[i] + " -> " + out));
841 }
842 }
843 #endif
844
845 delete gl;
846 }
847
848 /**
849 * Prefix, suffix support in hex transliterators
850 */
TestJ243(void)851 void TransliteratorTest::TestJ243(void) {
852 UErrorCode ec = U_ZERO_ERROR;
853
854 // Test default Hex-Any, which should handle
855 // \u, \U, u+, and U+
856 Transliterator *hex =
857 Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, ec);
858 if (assertSuccess("getInstance", ec)) {
859 expect(*hex, UnicodeString("\\u0041+\\U00000042,U+0043uU+0044z", ""), "A+B,CuDz");
860 }
861 delete hex;
862
863 // // Try a custom Hex-Unicode
864 // // \uXXXX and &#xXXXX;
865 // ec = U_ZERO_ERROR;
866 // HexToUnicodeTransliterator hex2(UnicodeString("\\\\u###0;&\\#x###0\\;", ""), ec);
867 // expect(hex2, UnicodeString("\\u61\\u062\\u0063\\u00645\\u66x0123", ""),
868 // "abcd5fx0123");
869 // // Try custom Any-Hex (default is tested elsewhere)
870 // ec = U_ZERO_ERROR;
871 // UnicodeToHexTransliterator hex3(UnicodeString("&\\#x###0;", ""), ec);
872 // expect(hex3, "012", "012");
873 }
874
875 /**
876 * Parsers need better syntax error messages.
877 */
TestJ329(void)878 void TransliteratorTest::TestJ329(void) {
879
880 struct { UBool containsErrors; const char* rule; } DATA[] = {
881 { FALSE, "a > b; c > d" },
882 { TRUE, "a > b; no operator; c > d" },
883 };
884 int32_t DATA_length = UPRV_LENGTHOF(DATA);
885
886 for (int32_t i=0; i<DATA_length; ++i) {
887 UErrorCode status = U_ZERO_ERROR;
888 UParseError parseError;
889 Transliterator *rbt = Transliterator::createFromRules("<ID>",
890 DATA[i].rule,
891 UTRANS_FORWARD,
892 parseError,
893 status);
894 UBool gotError = U_FAILURE(status);
895 UnicodeString desc(DATA[i].rule);
896 desc.append(gotError ? " -> error" : " -> no error");
897 if (gotError) {
898 desc = desc + ", ParseError code=" + u_errorName(status) +
899 " line=" + parseError.line +
900 " offset=" + parseError.offset +
901 " context=" + parseError.preContext;
902 }
903 if (gotError == DATA[i].containsErrors) {
904 logln(UnicodeString("Ok: ") + desc);
905 } else {
906 errln(UnicodeString("FAIL: ") + desc);
907 }
908 delete rbt;
909 }
910 }
911
912 /**
913 * Test segments and segment references.
914 */
TestSegments(void)915 void TransliteratorTest::TestSegments(void) {
916 // Array of 3n items
917 // Each item is <rules>, <input>, <expected output>
918 UnicodeString DATA[] = {
919 "([a-z]) '.' ([0-9]) > $2 '-' $1",
920 "abc.123.xyz.456",
921 "ab1-c23.xy4-z56",
922
923 // nested
924 "(([a-z])([0-9])) > $1 '.' $2 '.' $3;",
925 "a1 b2",
926 "a1.a.1 b2.b.2",
927 };
928 int32_t DATA_length = UPRV_LENGTHOF(DATA);
929
930 for (int32_t i=0; i<DATA_length; i+=3) {
931 logln("Pattern: " + prettify(DATA[i]));
932 UParseError parseError;
933 UErrorCode status = U_ZERO_ERROR;
934 Transliterator *t = Transliterator::createFromRules("ID", DATA[i], UTRANS_FORWARD, parseError, status);
935 if (U_FAILURE(status)) {
936 errln("FAIL: RBT constructor");
937 } else {
938 expect(*t, DATA[i+1], DATA[i+2]);
939 }
940 delete t;
941 }
942 }
943
944 /**
945 * Test cursor positioning outside of the key
946 */
TestCursorOffset(void)947 void TransliteratorTest::TestCursorOffset(void) {
948 // Array of 3n items
949 // Each item is <rules>, <input>, <expected output>
950 UnicodeString DATA[] = {
951 "pre {alpha} post > | @ ALPHA ;"
952 "eALPHA > beta ;"
953 "pre {beta} post > BETA @@ | ;"
954 "post > xyz",
955
956 "prealphapost prebetapost",
957
958 "prbetaxyz preBETApost",
959 };
960 int32_t DATA_length = UPRV_LENGTHOF(DATA);
961
962 for (int32_t i=0; i<DATA_length; i+=3) {
963 logln("Pattern: " + prettify(DATA[i]));
964 UParseError parseError;
965 UErrorCode status = U_ZERO_ERROR;
966 Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
967 if (U_FAILURE(status)) {
968 errln("FAIL: RBT constructor");
969 } else {
970 expect(*t, DATA[i+1], DATA[i+2]);
971 }
972 delete t;
973 }
974 }
975
976 /**
977 * Test zero length and > 1 char length variable values. Test
978 * use of variable refs in UnicodeSets.
979 */
TestArbitraryVariableValues(void)980 void TransliteratorTest::TestArbitraryVariableValues(void) {
981 // Array of 3n items
982 // Each item is <rules>, <input>, <expected output>
983 UnicodeString DATA[] = {
984 "$abe = ab;"
985 "$pat = x[yY]z;"
986 "$ll = 'a-z';"
987 "$llZ = [$ll];"
988 "$llY = [$ll$pat];"
989 "$emp = ;"
990
991 "$abe > ABE;"
992 "$pat > END;"
993 "$llZ > 1;"
994 "$llY > 2;"
995 "7$emp 8 > 9;"
996 "",
997
998 "ab xYzxyz stY78",
999 "ABE ENDEND 1129",
1000 };
1001 int32_t DATA_length = UPRV_LENGTHOF(DATA);
1002
1003 for (int32_t i=0; i<DATA_length; i+=3) {
1004 logln("Pattern: " + prettify(DATA[i]));
1005 UParseError parseError;
1006 UErrorCode status = U_ZERO_ERROR;
1007 Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
1008 if (U_FAILURE(status)) {
1009 errln("FAIL: RBT constructor");
1010 } else {
1011 expect(*t, DATA[i+1], DATA[i+2]);
1012 }
1013 delete t;
1014 }
1015 }
1016
1017 /**
1018 * Confirm that the contextStart, contextLimit, start, and limit
1019 * behave correctly. J474.
1020 */
TestPositionHandling(void)1021 void TransliteratorTest::TestPositionHandling(void) {
1022 // Array of 3n items
1023 // Each item is <rules>, <input>, <expected output>
1024 const char* DATA[] = {
1025 "a{t} > SS ; {t}b > UU ; {t} > TT ;",
1026 "xtat txtb", // pos 0,9,0,9
1027 "xTTaSS TTxUUb",
1028
1029 "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1030 "xtat txtb", // pos 2,9,3,8
1031 "xtaSS TTxUUb",
1032
1033 "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1034 "xtat txtb", // pos 3,8,3,8
1035 "xtaTT TTxTTb",
1036 };
1037
1038 // Array of 4n positions -- these go with the DATA array
1039 // They are: contextStart, contextLimit, start, limit
1040 int32_t POS[] = {
1041 0, 9, 0, 9,
1042 2, 9, 3, 8,
1043 3, 8, 3, 8,
1044 };
1045
1046 int32_t n = UPRV_LENGTHOF(DATA) / 3;
1047 for (int32_t i=0; i<n; i++) {
1048 UErrorCode status = U_ZERO_ERROR;
1049 UParseError parseError;
1050 Transliterator *t = Transliterator::createFromRules("<ID>",
1051 DATA[3*i], UTRANS_FORWARD, parseError, status);
1052 if (U_FAILURE(status)) {
1053 delete t;
1054 errln("FAIL: RBT constructor");
1055 return;
1056 }
1057 UTransPosition pos;
1058 pos.contextStart= POS[4*i];
1059 pos.contextLimit = POS[4*i+1];
1060 pos.start = POS[4*i+2];
1061 pos.limit = POS[4*i+3];
1062 UnicodeString rsource(DATA[3*i+1]);
1063 t->transliterate(rsource, pos, status);
1064 if (U_FAILURE(status)) {
1065 delete t;
1066 errln("FAIL: transliterate");
1067 return;
1068 }
1069 t->finishTransliteration(rsource, pos);
1070 expectAux(DATA[3*i],
1071 DATA[3*i+1],
1072 rsource,
1073 DATA[3*i+2]);
1074 delete t;
1075 }
1076 }
1077
1078 /**
1079 * Test the Hiragana-Katakana transliterator.
1080 */
TestHiraganaKatakana(void)1081 void TransliteratorTest::TestHiraganaKatakana(void) {
1082 UParseError parseError;
1083 UErrorCode status = U_ZERO_ERROR;
1084 Transliterator* hk = Transliterator::createInstance("Hiragana-Katakana", UTRANS_FORWARD, parseError, status);
1085 Transliterator* kh = Transliterator::createInstance("Katakana-Hiragana", UTRANS_FORWARD, parseError, status);
1086 if (hk == 0 || kh == 0) {
1087 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1088 delete hk;
1089 delete kh;
1090 return;
1091 }
1092
1093 // Array of 3n items
1094 // Each item is "hk"|"kh"|"both", <Hiragana>, <Katakana>
1095 const char* DATA[] = {
1096 "both",
1097 "\\u3042\\u3090\\u3099\\u3092\\u3050",
1098 "\\u30A2\\u30F8\\u30F2\\u30B0",
1099
1100 "kh",
1101 "\\u307C\\u3051\\u3060\\u3042\\u3093\\u30FC",
1102 "\\u30DC\\u30F6\\u30C0\\u30FC\\u30F3\\u30FC",
1103 };
1104 int32_t DATA_length = UPRV_LENGTHOF(DATA);
1105
1106 for (int32_t i=0; i<DATA_length; i+=3) {
1107 UnicodeString h = CharsToUnicodeString(DATA[i+1]);
1108 UnicodeString k = CharsToUnicodeString(DATA[i+2]);
1109 switch (*DATA[i]) {
1110 case 0x68: //'h': // Hiragana-Katakana
1111 expect(*hk, h, k);
1112 break;
1113 case 0x6B: //'k': // Katakana-Hiragana
1114 expect(*kh, k, h);
1115 break;
1116 case 0x62: //'b': // both
1117 expect(*hk, h, k);
1118 expect(*kh, k, h);
1119 break;
1120 }
1121 }
1122 delete hk;
1123 delete kh;
1124 }
1125
1126 /**
1127 * Test cloning / copy constructor of RBT.
1128 */
TestCopyJ476(void)1129 void TransliteratorTest::TestCopyJ476(void) {
1130 // The real test here is what happens when the destructors are
1131 // called. So we let one object get destructed, and check to
1132 // see that its copy still works.
1133 Transliterator *t2 = 0;
1134 {
1135 UParseError parseError;
1136 UErrorCode status = U_ZERO_ERROR;
1137 Transliterator *t1 = Transliterator::createFromRules("t1",
1138 "a>A;b>B;'foo'+>'bar'", UTRANS_FORWARD, parseError, status);
1139 if (U_FAILURE(status)) {
1140 errln("FAIL: RBT constructor");
1141 return;
1142 }
1143 t2 = t1->clone(); // Call copy constructor under the covers.
1144 expect(*t1, "abcfoofoo", "ABcbar");
1145 delete t1;
1146 }
1147 expect(*t2, "abcfoofoo", "ABcbar");
1148 delete t2;
1149 }
1150
1151 /**
1152 * Test inter-Indic transliterators. These are composed.
1153 * ICU4C Jitterbug 483.
1154 */
TestInterIndic(void)1155 void TransliteratorTest::TestInterIndic(void) {
1156 UnicodeString ID("Devanagari-Gujarati", "");
1157 UErrorCode status = U_ZERO_ERROR;
1158 UParseError parseError;
1159 Transliterator* dg = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1160 if (dg == 0) {
1161 dataerrln("FAIL: createInstance(" + ID + ") returned NULL - " + u_errorName(status));
1162 return;
1163 }
1164 UnicodeString id = dg->getID();
1165 if (id != ID) {
1166 errln("FAIL: createInstance(" + ID + ")->getID() => " + id);
1167 }
1168 UnicodeString dev = CharsToUnicodeString("\\u0901\\u090B\\u0925");
1169 UnicodeString guj = CharsToUnicodeString("\\u0A81\\u0A8B\\u0AA5");
1170 expect(*dg, dev, guj);
1171 delete dg;
1172 }
1173
1174 /**
1175 * Test filter syntax in IDs. (J918)
1176 */
TestFilterIDs(void)1177 void TransliteratorTest::TestFilterIDs(void) {
1178 // Array of 3n strings:
1179 // <id>, <inverse id>, <input>, <expected output>
1180 const char* DATA[] = {
1181 "[aeiou]Any-Hex", // ID
1182 "[aeiou]Hex-Any", // expected inverse ID
1183 "quizzical", // src
1184 "q\\u0075\\u0069zz\\u0069c\\u0061l", // expected ID.translit(src)
1185
1186 "[aeiou]Any-Hex;[^5]Hex-Any",
1187 "[^5]Any-Hex;[aeiou]Hex-Any",
1188 "quizzical",
1189 "q\\u0075izzical",
1190
1191 "[abc]Null",
1192 "[abc]Null",
1193 "xyz",
1194 "xyz",
1195 };
1196 enum { DATA_length = UPRV_LENGTHOF(DATA) };
1197
1198 for (int i=0; i<DATA_length; i+=4) {
1199 UnicodeString ID(DATA[i], "");
1200 UnicodeString uID(DATA[i+1], "");
1201 UnicodeString data2(DATA[i+2], "");
1202 UnicodeString data3(DATA[i+3], "");
1203 UParseError parseError;
1204 UErrorCode status = U_ZERO_ERROR;
1205 Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1206 if (t == 0) {
1207 errln("FAIL: createInstance(" + ID + ") returned NULL");
1208 return;
1209 }
1210 expect(*t, data2, data3);
1211
1212 // Check the ID
1213 if (ID != t->getID()) {
1214 errln("FAIL: createInstance(" + ID + ").getID() => " +
1215 t->getID());
1216 }
1217
1218 // Check the inverse
1219 Transliterator *u = t->createInverse(status);
1220 if (u == 0) {
1221 errln("FAIL: " + ID + ".createInverse() returned NULL");
1222 } else if (u->getID() != uID) {
1223 errln("FAIL: " + ID + ".createInverse().getID() => " +
1224 u->getID() + ", expected " + uID);
1225 }
1226
1227 delete t;
1228 delete u;
1229 }
1230 }
1231
1232 /**
1233 * Test the case mapping transliterators.
1234 */
TestCaseMap(void)1235 void TransliteratorTest::TestCaseMap(void) {
1236 UParseError parseError;
1237 UErrorCode status = U_ZERO_ERROR;
1238 Transliterator* toUpper =
1239 Transliterator::createInstance("Any-Upper[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1240 Transliterator* toLower =
1241 Transliterator::createInstance("Any-Lower[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1242 Transliterator* toTitle =
1243 Transliterator::createInstance("Any-Title[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1244 if (toUpper==0 || toLower==0 || toTitle==0) {
1245 errln("FAIL: createInstance returned NULL");
1246 delete toUpper;
1247 delete toLower;
1248 delete toTitle;
1249 return;
1250 }
1251
1252 expect(*toUpper, "The quick brown fox jumped over the lazy dogs.",
1253 "THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
1254 expect(*toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
1255 "the quick brown foX jumped over the lazY dogs.");
1256 expect(*toTitle, "the quick brown foX can't jump over the laZy dogs.",
1257 "The Quick Brown FoX Can't Jump Over The LaZy Dogs.");
1258
1259 delete toUpper;
1260 delete toLower;
1261 delete toTitle;
1262 }
1263
1264 /**
1265 * Test the name mapping transliterators.
1266 */
TestNameMap(void)1267 void TransliteratorTest::TestNameMap(void) {
1268 UParseError parseError;
1269 UErrorCode status = U_ZERO_ERROR;
1270 Transliterator* uni2name =
1271 Transliterator::createInstance("Any-Name[^abc]", UTRANS_FORWARD, parseError, status);
1272 Transliterator* name2uni =
1273 Transliterator::createInstance("Name-Any", UTRANS_FORWARD, parseError, status);
1274 if (uni2name==0 || name2uni==0) {
1275 errln("FAIL: createInstance returned NULL");
1276 delete uni2name;
1277 delete name2uni;
1278 return;
1279 }
1280
1281 // Careful: CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1282 expect(*uni2name, CharsToUnicodeString("\\u00A0abc\\u4E01\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF"),
1283 CharsToUnicodeString("\\\\N{NO-BREAK SPACE}abc\\\\N{CJK UNIFIED IDEOGRAPH-4E01}\\\\N{MICRO SIGN}\\\\N{GUJARATI SIGN CANDRABINDU}\\\\N{REPLACEMENT CHARACTER}\\\\N{<control-0004>}\\\\N{<control-0009>}\\\\N{<control-0081>}\\\\N{<noncharacter-FFFF>}"));
1284 expect(*name2uni, UNICODE_STRING_SIMPLE("{\\N { NO-BREAK SPACE}abc\\N{ CJK UNIFIED IDEOGRAPH-4E01 }\\N{x\\N{MICRO SIGN}\\N{GUJARATI SIGN CANDRABINDU}\\N{REPLACEMENT CHARACTER}\\N{<control-0004>}\\N{<control-0009>}\\N{<control-0081>}\\N{<noncharacter-FFFF>}\\N{<control-0004>}\\N{"),
1285 CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{"));
1286
1287 delete uni2name;
1288 delete name2uni;
1289
1290 // round trip
1291 Transliterator* t =
1292 Transliterator::createInstance("Any-Name;Name-Any", UTRANS_FORWARD, parseError, status);
1293 if (t==0) {
1294 errln("FAIL: createInstance returned NULL");
1295 delete t;
1296 return;
1297 }
1298
1299 // Careful: CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1300 UnicodeString s = CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{");
1301 expect(*t, s, s);
1302 delete t;
1303 }
1304
1305 /**
1306 * Test liberalized ID syntax. 1006c
1307 */
TestLiberalizedID(void)1308 void TransliteratorTest::TestLiberalizedID(void) {
1309 // Some test cases have an expected getID() value of NULL. This
1310 // means I have disabled the test case for now. This stuff is
1311 // still under development, and I haven't decided whether to make
1312 // getID() return canonical case yet. It will all get rewritten
1313 // with the move to Source-Target/Variant IDs anyway. [aliu]
1314 const char* DATA[] = {
1315 "latin-greek", NULL /*"Latin-Greek"*/, "case insensitivity",
1316 " Null ", "Null", "whitespace",
1317 " Latin[a-z]-Greek ", "[a-z]Latin-Greek", "inline filter",
1318 " null ; latin-greek ", NULL /*"Null;Latin-Greek"*/, "compound whitespace",
1319 };
1320 const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1321 UParseError parseError;
1322 UErrorCode status= U_ZERO_ERROR;
1323 for (int32_t i=0; i<DATA_length; i+=3) {
1324 Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, parseError, status);
1325 if (t == 0) {
1326 dataerrln(UnicodeString("FAIL: ") + DATA[i+2] +
1327 " cannot create ID \"" + DATA[i] + "\" - " + u_errorName(status));
1328 } else {
1329 UnicodeString exp;
1330 if (DATA[i+1]) {
1331 exp = UnicodeString(DATA[i+1], "");
1332 }
1333 // Don't worry about getID() if the expected char*
1334 // is NULL -- see above.
1335 if (exp.length() == 0 || exp == t->getID()) {
1336 logln(UnicodeString("Ok: ") + DATA[i+2] +
1337 " create ID \"" + DATA[i] + "\" => \"" +
1338 exp + "\"");
1339 } else {
1340 errln(UnicodeString("FAIL: ") + DATA[i+2] +
1341 " create ID \"" + DATA[i] + "\" => \"" +
1342 t->getID() + "\", exp \"" + exp + "\"");
1343 }
1344 delete t;
1345 }
1346 }
1347 }
1348
1349 /* test for Jitterbug 912 */
TestCreateInstance()1350 void TransliteratorTest::TestCreateInstance(){
1351 const char* FORWARD = "F";
1352 const char* REVERSE = "R";
1353 const char* DATA[] = {
1354 // Column 1: id
1355 // Column 2: direction
1356 // Column 3: expected ID, or "" if expect failure
1357 "Latin-Hangul", REVERSE, "Hangul-Latin", // JB#912
1358
1359 // JB#2689: bad compound causes crash
1360 "InvalidSource-InvalidTarget", FORWARD, "",
1361 "InvalidSource-InvalidTarget", REVERSE, "",
1362 "Hex-Any;InvalidSource-InvalidTarget", FORWARD, "",
1363 "Hex-Any;InvalidSource-InvalidTarget", REVERSE, "",
1364 "InvalidSource-InvalidTarget;Hex-Any", FORWARD, "",
1365 "InvalidSource-InvalidTarget;Hex-Any", REVERSE, "",
1366
1367 NULL
1368 };
1369
1370 for (int32_t i=0; DATA[i]; i+=3) {
1371 UParseError err;
1372 UErrorCode ec = U_ZERO_ERROR;
1373 UnicodeString id(DATA[i]);
1374 UTransDirection dir = (DATA[i+1]==FORWARD)?
1375 UTRANS_FORWARD:UTRANS_REVERSE;
1376 UnicodeString expID(DATA[i+2]);
1377 Transliterator* t =
1378 Transliterator::createInstance(id,dir,err,ec);
1379 UnicodeString newID;
1380 if (t) {
1381 newID = t->getID();
1382 }
1383 UBool ok = (newID == expID);
1384 if (!t) {
1385 newID = u_errorName(ec);
1386 }
1387 if (ok) {
1388 logln((UnicodeString)"Ok: createInstance(" +
1389 id + "," + DATA[i+1] + ") => " + newID);
1390 } else {
1391 dataerrln((UnicodeString)"FAIL: createInstance(" +
1392 id + "," + DATA[i+1] + ") => " + newID +
1393 ", expected " + expID);
1394 }
1395 delete t;
1396 }
1397 }
1398
1399 /**
1400 * Test the normalization transliterator.
1401 */
TestNormalizationTransliterator()1402 void TransliteratorTest::TestNormalizationTransliterator() {
1403 // THE FOLLOWING TWO TABLES ARE COPIED FROM com.ibm.test.normalizer.BasicTest
1404 // PLEASE KEEP THEM IN SYNC WITH BasicTest.
1405 const char* CANON[] = {
1406 // Input Decomposed Composed
1407 "cat", "cat", "cat" ,
1408 "\\u00e0ardvark", "a\\u0300ardvark", "\\u00e0ardvark" ,
1409
1410 "\\u1e0a", "D\\u0307", "\\u1e0a" , // D-dot_above
1411 "D\\u0307", "D\\u0307", "\\u1e0a" , // D dot_above
1412
1413 "\\u1e0c\\u0307", "D\\u0323\\u0307", "\\u1e0c\\u0307" , // D-dot_below dot_above
1414 "\\u1e0a\\u0323", "D\\u0323\\u0307", "\\u1e0c\\u0307" , // D-dot_above dot_below
1415 "D\\u0307\\u0323", "D\\u0323\\u0307", "\\u1e0c\\u0307" , // D dot_below dot_above
1416
1417 "\\u1e10\\u0307\\u0323", "D\\u0327\\u0323\\u0307","\\u1e10\\u0323\\u0307", // D dot_below cedilla dot_above
1418 "D\\u0307\\u0328\\u0323","D\\u0328\\u0323\\u0307","\\u1e0c\\u0328\\u0307", // D dot_above ogonek dot_below
1419
1420 "\\u1E14", "E\\u0304\\u0300", "\\u1E14" , // E-macron-grave
1421 "\\u0112\\u0300", "E\\u0304\\u0300", "\\u1E14" , // E-macron + grave
1422 "\\u00c8\\u0304", "E\\u0300\\u0304", "\\u00c8\\u0304" , // E-grave + macron
1423
1424 "\\u212b", "A\\u030a", "\\u00c5" , // angstrom_sign
1425 "\\u00c5", "A\\u030a", "\\u00c5" , // A-ring
1426
1427 "\\u00fdffin", "y\\u0301ffin", "\\u00fdffin" , //updated with 3.0
1428 "\\u00fd\\uFB03n", "y\\u0301\\uFB03n", "\\u00fd\\uFB03n" , //updated with 3.0
1429
1430 "Henry IV", "Henry IV", "Henry IV" ,
1431 "Henry \\u2163", "Henry \\u2163", "Henry \\u2163" ,
1432
1433 "\\u30AC", "\\u30AB\\u3099", "\\u30AC" , // ga (Katakana)
1434 "\\u30AB\\u3099", "\\u30AB\\u3099", "\\u30AC" , // ka + ten
1435 "\\uFF76\\uFF9E", "\\uFF76\\uFF9E", "\\uFF76\\uFF9E" , // hw_ka + hw_ten
1436 "\\u30AB\\uFF9E", "\\u30AB\\uFF9E", "\\u30AB\\uFF9E" , // ka + hw_ten
1437 "\\uFF76\\u3099", "\\uFF76\\u3099", "\\uFF76\\u3099" , // hw_ka + ten
1438
1439 "A\\u0300\\u0316", "A\\u0316\\u0300", "\\u00C0\\u0316" ,
1440 0 // end
1441 };
1442
1443 const char* COMPAT[] = {
1444 // Input Decomposed Composed
1445 "\\uFB4f", "\\u05D0\\u05DC", "\\u05D0\\u05DC" , // Alef-Lamed vs. Alef, Lamed
1446
1447 "\\u00fdffin", "y\\u0301ffin", "\\u00fdffin" , //updated for 3.0
1448 "\\u00fd\\uFB03n", "y\\u0301ffin", "\\u00fdffin" , // ffi ligature -> f + f + i
1449
1450 "Henry IV", "Henry IV", "Henry IV" ,
1451 "Henry \\u2163", "Henry IV", "Henry IV" ,
1452
1453 "\\u30AC", "\\u30AB\\u3099", "\\u30AC" , // ga (Katakana)
1454 "\\u30AB\\u3099", "\\u30AB\\u3099", "\\u30AC" , // ka + ten
1455
1456 "\\uFF76\\u3099", "\\u30AB\\u3099", "\\u30AC" , // hw_ka + ten
1457 0 // end
1458 };
1459
1460 int32_t i;
1461 UParseError parseError;
1462 UErrorCode status = U_ZERO_ERROR;
1463 Transliterator* NFD = Transliterator::createInstance("NFD", UTRANS_FORWARD, parseError, status);
1464 Transliterator* NFC = Transliterator::createInstance("NFC", UTRANS_FORWARD, parseError, status);
1465 if (!NFD || !NFC) {
1466 dataerrln("FAIL: createInstance failed: %s", u_errorName(status));
1467 delete NFD;
1468 delete NFC;
1469 return;
1470 }
1471 for (i=0; CANON[i]; i+=3) {
1472 UnicodeString in = CharsToUnicodeString(CANON[i]);
1473 UnicodeString expd = CharsToUnicodeString(CANON[i+1]);
1474 UnicodeString expc = CharsToUnicodeString(CANON[i+2]);
1475 expect(*NFD, in, expd);
1476 expect(*NFC, in, expc);
1477 }
1478 delete NFD;
1479 delete NFC;
1480
1481 Transliterator* NFKD = Transliterator::createInstance("NFKD", UTRANS_FORWARD, parseError, status);
1482 Transliterator* NFKC = Transliterator::createInstance("NFKC", UTRANS_FORWARD, parseError, status);
1483 if (!NFKD || !NFKC) {
1484 dataerrln("FAIL: createInstance failed");
1485 delete NFKD;
1486 delete NFKC;
1487 return;
1488 }
1489 for (i=0; COMPAT[i]; i+=3) {
1490 UnicodeString in = CharsToUnicodeString(COMPAT[i]);
1491 UnicodeString expkd = CharsToUnicodeString(COMPAT[i+1]);
1492 UnicodeString expkc = CharsToUnicodeString(COMPAT[i+2]);
1493 expect(*NFKD, in, expkd);
1494 expect(*NFKC, in, expkc);
1495 }
1496 delete NFKD;
1497 delete NFKC;
1498
1499 UParseError pe;
1500 status = U_ZERO_ERROR;
1501 Transliterator *t = Transliterator::createInstance("NFD; [x]Remove",
1502 UTRANS_FORWARD,
1503 pe, status);
1504 if (t == 0) {
1505 errln("FAIL: createInstance failed");
1506 }
1507 expect(*t, CharsToUnicodeString("\\u010dx"),
1508 CharsToUnicodeString("c\\u030C"));
1509 delete t;
1510 }
1511
1512 /**
1513 * Test we can create basic transliterator even without data.
1514 */
TestBasicTransliteratorEvenWithoutData()1515 void TransliteratorTest::TestBasicTransliteratorEvenWithoutData() {
1516 const char16_t* TEST_DATA = u"\u0124e\u0301 \uFB01nd x";
1517 const char16_t* EXPECTED_RESULTS[] = {
1518 u"H\u0302e\u0301 \uFB01nd x", // NFD
1519 u"\u0124\u00E9 \uFB01nd x", // NFC
1520 u"H\u0302e\u0301 find x", // NFKD
1521 u"\u0124\u00E9 find x", // NFKC
1522 u"\u0124e\u0301 \uFB01nd x", // Hex-Any
1523 u"\u0125e\u0301 \uFB01nd x", // Lower
1524 u"\u0124e\uFB01ndx", // [:^L:]Remove
1525 u"H\u0302e\u0301 \uFB01nd ", // NFD; [x]Remove
1526 u"h\u0302e\u0301 find x", // Lower; NFKD;
1527 u"hefindx", // Lower; NFKD; [:^L:]Remove; NFC;
1528 u"\u0124e \uFB01nd x", // [:Nonspacing Mark:] Remove;
1529 u"He \uFB01nd x", // NFD; [:Nonspacing Mark:] Remove; NFC;
1530 // end
1531 0
1532 };
1533
1534 const char* BASIC_TRANSLITERATOR_ID[] = {
1535 "NFD",
1536 "NFC",
1537 "NFKD",
1538 "NFKC",
1539 "Hex-Any",
1540 "Lower",
1541 "[:^L:]Remove",
1542 "NFD; [x]Remove",
1543 "Lower; NFKD;",
1544 "Lower; NFKD; [:^L:]Remove; NFC;",
1545 "[:Nonspacing Mark:] Remove;",
1546 "NFD; [:Nonspacing Mark:] Remove; NFC;",
1547 // end
1548 0
1549 };
1550 const char* BASIC_TRANSLITERATOR_RULES[] = {
1551 "::Lower; ::NFKD;",
1552 "::Lower; ::NFKD; ::[:^L:]Remove; ::NFC;",
1553 "::[:Nonspacing Mark:] Remove;",
1554 "::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;",
1555 // end
1556 0
1557 };
1558 for (int32_t i=0; BASIC_TRANSLITERATOR_ID[i]; i++) {
1559 UErrorCode status = U_ZERO_ERROR;
1560 UParseError parseError;
1561 std::unique_ptr<Transliterator> translit(Transliterator::createInstance(
1562 BASIC_TRANSLITERATOR_ID[i], UTRANS_FORWARD, parseError, status));
1563 if (translit.get() == nullptr || !U_SUCCESS(status)) {
1564 dataerrln("FAIL: createInstance %s failed", BASIC_TRANSLITERATOR_ID[i]);
1565 continue;
1566 }
1567 UnicodeString data(TEST_DATA);
1568 UnicodeString expected(EXPECTED_RESULTS[i]);
1569 translit->transliterate(data);
1570 if (data != expected) {
1571 dataerrln(UnicodeString("FAIL: expected translit(") +
1572 BASIC_TRANSLITERATOR_ID[i] + ") = '" +
1573 EXPECTED_RESULTS[i] + "' but got '" + data);
1574 continue;
1575 }
1576 }
1577 for (int32_t i=0; BASIC_TRANSLITERATOR_RULES[i]; i++) {
1578 UErrorCode status = U_ZERO_ERROR;
1579 UParseError parseError;
1580 std::unique_ptr<Transliterator> translit(Transliterator::createFromRules(
1581 "Test",
1582 BASIC_TRANSLITERATOR_RULES[i], UTRANS_FORWARD, parseError, status));
1583 if (translit.get() == nullptr || !U_SUCCESS(status)) {
1584 dataerrln("FAIL: createFromRules %s failed", BASIC_TRANSLITERATOR_RULES[i]);
1585 continue;
1586 }
1587 }
1588 }
1589
1590 /**
1591 * Test compound RBT rules.
1592 */
TestCompoundRBT(void)1593 void TransliteratorTest::TestCompoundRBT(void) {
1594 // Careful with spacing and ';' here: Phrase this exactly
1595 // as toRules() is going to return it. If toRules() changes
1596 // with regard to spacing or ';', then adjust this string.
1597 UnicodeString rule("::Hex-Any;\n"
1598 "::Any-Lower;\n"
1599 "a > '.A.';\n"
1600 "b > '.B.';\n"
1601 "::[^t]Any-Upper;", "");
1602 UParseError parseError;
1603 UErrorCode status = U_ZERO_ERROR;
1604 Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, parseError, status);
1605 if (t == 0) {
1606 errln("FAIL: createFromRules failed");
1607 return;
1608 }
1609 expect(*t, UNICODE_STRING_SIMPLE("\\u0043at in the hat, bat on the mat"),
1610 "C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
1611 UnicodeString r;
1612 t->toRules(r, TRUE);
1613 if (r == rule) {
1614 logln((UnicodeString)"OK: toRules() => " + r);
1615 } else {
1616 errln((UnicodeString)"FAIL: toRules() => " + r +
1617 ", expected " + rule);
1618 }
1619 delete t;
1620
1621 // Now test toRules
1622 t = Transliterator::createInstance("Greek-Latin; Latin-Cyrillic", UTRANS_FORWARD, parseError, status);
1623 if (t == 0) {
1624 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1625 return;
1626 }
1627 UnicodeString exp("::Greek-Latin;\n::Latin-Cyrillic;");
1628 t->toRules(r, TRUE);
1629 if (r != exp) {
1630 errln((UnicodeString)"FAIL: toRules() => " + r +
1631 ", expected " + exp);
1632 } else {
1633 logln((UnicodeString)"OK: toRules() => " + r);
1634 }
1635 delete t;
1636
1637 // Round trip the result of toRules
1638 t = Transliterator::createFromRules("Test", r, UTRANS_FORWARD, parseError, status);
1639 if (t == 0) {
1640 errln("FAIL: createFromRules #2 failed");
1641 return;
1642 } else {
1643 logln((UnicodeString)"OK: createFromRules(" + r + ") succeeded");
1644 }
1645
1646 // Test toRules again
1647 t->toRules(r, TRUE);
1648 if (r != exp) {
1649 errln((UnicodeString)"FAIL: toRules() => " + r +
1650 ", expected " + exp);
1651 } else {
1652 logln((UnicodeString)"OK: toRules() => " + r);
1653 }
1654
1655 delete t;
1656
1657 // Test Foo(Bar) IDs. Careful with spacing in id; make it conform
1658 // to what the regenerated ID will look like.
1659 UnicodeString id("Upper(Lower);(NFKC)", "");
1660 t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
1661 if (t == 0) {
1662 errln("FAIL: createInstance #2 failed");
1663 return;
1664 }
1665 if (t->getID() == id) {
1666 logln((UnicodeString)"OK: created " + id);
1667 } else {
1668 errln((UnicodeString)"FAIL: createInstance(" + id +
1669 ").getID() => " + t->getID());
1670 }
1671
1672 Transliterator *u = t->createInverse(status);
1673 if (u == 0) {
1674 errln("FAIL: createInverse failed");
1675 delete t;
1676 return;
1677 }
1678 exp = "NFKC();Lower(Upper)";
1679 if (u->getID() == exp) {
1680 logln((UnicodeString)"OK: createInverse(" + id + ") => " +
1681 u->getID());
1682 } else {
1683 errln((UnicodeString)"FAIL: createInverse(" + id + ") => " +
1684 u->getID());
1685 }
1686 delete t;
1687 delete u;
1688 }
1689
1690 /**
 * Compound filter semantics were originally not implemented
 * correctly. Originally, each component filter f(i) was replaced by
 * f'(i) = f(i) && g, where g is the filter for the compound
 * transliterator.
1695 *
1696 * From Mark:
1697 *
 * Suppose I have a transliterator X. Internally X is
 * "Greek-Latin; Latin-Cyrillic; Any-Lower". I use a filter [^A].
1700 *
1701 * The compound should convert all greek characters (through latin) to
1702 * cyrillic, then lowercase the result. The filter should say "don't
1703 * touch 'A' in the original". But because an intermediate result
1704 * happens to go through "A", the Greek Alpha gets hung up.
1705 */
TestCompoundFilter(void)1706 void TransliteratorTest::TestCompoundFilter(void) {
1707 UParseError parseError;
1708 UErrorCode status = U_ZERO_ERROR;
1709 Transliterator *t = Transliterator::createInstance
1710 ("Greek-Latin; Latin-Greek; Lower", UTRANS_FORWARD, parseError, status);
1711 if (t == 0) {
1712 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1713 return;
1714 }
1715 t->adoptFilter(new UnicodeSet("[^A]", status));
1716 if (U_FAILURE(status)) {
1717 errln("FAIL: UnicodeSet ct failed");
1718 delete t;
1719 return;
1720 }
1721
1722 // Only the 'A' at index 1 should remain unchanged
1723 expect(*t,
1724 CharsToUnicodeString("BA\\u039A\\u0391"),
1725 CharsToUnicodeString("\\u03b2A\\u03ba\\u03b1"));
1726 delete t;
1727 }
1728
TestRemove(void)1729 void TransliteratorTest::TestRemove(void) {
1730 UParseError parseError;
1731 UErrorCode status = U_ZERO_ERROR;
1732 Transliterator *t = Transliterator::createInstance("Remove[abc]", UTRANS_FORWARD, parseError, status);
1733 if (t == 0) {
1734 errln("FAIL: createInstance failed");
1735 return;
1736 }
1737
1738 expect(*t, "Able bodied baker's cats", "Ale odied ker's ts");
1739
1740 // extra test for RemoveTransliterator::clone(), which at one point wasn't
1741 // duplicating the filter
1742 Transliterator* t2 = t->clone();
1743 expect(*t2, "Able bodied baker's cats", "Ale odied ker's ts");
1744
1745 delete t;
1746 delete t2;
1747 }
1748
TestToRules(void)1749 void TransliteratorTest::TestToRules(void) {
1750 const char* RBT = "rbt";
1751 const char* SET = "set";
1752 static const char* DATA[] = {
1753 RBT,
1754 "$a=\\u4E61; [$a] > A;",
1755 "[\\u4E61] > A;",
1756
1757 RBT,
1758 "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
1759 "[[:Zs:][:Zl:]]{a} > A;",
1760
1761 SET,
1762 "[[:Zs:][:Zl:]]",
1763 "[[:Zs:][:Zl:]]",
1764
1765 SET,
1766 "[:Ps:]",
1767 "[:Ps:]",
1768
1769 SET,
1770 "[:L:]",
1771 "[:L:]",
1772
1773 SET,
1774 "[[:L:]-[A]]",
1775 "[[:L:]-[A]]",
1776
1777 SET,
1778 "[~[:Lu:][:Ll:]]",
1779 "[~[:Lu:][:Ll:]]",
1780
1781 SET,
1782 "[~[a-z]]",
1783 "[~[a-z]]",
1784
1785 RBT,
1786 "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
1787 "[^[:Zs:]]{a} > A;",
1788
1789 RBT,
1790 "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
1791 "[[a-z]-[:Zs:]]{a} > A;",
1792
1793 RBT,
1794 "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
1795 "[[:Zs:]&[a-z]]{a} > A;",
1796
1797 RBT,
1798 "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
1799 "[x[:Zs:]]{a} > A;",
1800
1801 RBT,
1802 "$accentMinus = [ [\\u0300-\\u0345] & [:M:] - [\\u0338]] ;"
1803 "$macron = \\u0304 ;"
1804 "$evowel = [aeiouyAEIOUY] ;"
1805 "$iotasub = \\u0345 ;"
1806 "($evowel $macron $accentMinus *) i > | $1 $iotasub ;",
1807 "([AEIOUYaeiouy]\\u0304[[\\u0300-\\u0345]&[:M:]-[\\u0338]]*)i > | $1 \\u0345;",
1808
1809 RBT,
1810 "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1811 "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1812 };
1813 static const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1814
1815 for (int32_t d=0; d < DATA_length; d+=3) {
1816 if (DATA[d] == RBT) {
1817 // Transliterator test
1818 UParseError parseError;
1819 UErrorCode status = U_ZERO_ERROR;
1820 Transliterator *t = Transliterator::createFromRules("ID",
1821 UnicodeString(DATA[d+1], -1, US_INV), UTRANS_FORWARD, parseError, status);
1822 if (t == 0) {
1823 dataerrln("FAIL: createFromRules failed - %s", u_errorName(status));
1824 return;
1825 }
1826 UnicodeString rules, escapedRules;
1827 t->toRules(rules, FALSE);
1828 t->toRules(escapedRules, TRUE);
1829 UnicodeString expRules = CharsToUnicodeString(DATA[d+2]);
1830 UnicodeString expEscapedRules(DATA[d+2], -1, US_INV);
1831 if (rules == expRules) {
1832 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1833 " => " + rules);
1834 } else {
1835 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1836 " => " + rules + ", exp " + expRules);
1837 }
1838 if (escapedRules == expEscapedRules) {
1839 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1840 " => " + escapedRules);
1841 } else {
1842 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1843 " => " + escapedRules + ", exp " + expEscapedRules);
1844 }
1845 delete t;
1846
1847 } else {
1848 // UnicodeSet test
1849 UErrorCode status = U_ZERO_ERROR;
1850 UnicodeString pat(DATA[d+1], -1, US_INV);
1851 UnicodeString expToPat(DATA[d+2], -1, US_INV);
1852 UnicodeSet set(pat, status);
1853 if (U_FAILURE(status)) {
1854 errln("FAIL: UnicodeSet ct failed");
1855 return;
1856 }
1857 // Adjust spacing etc. as necessary.
1858 UnicodeString toPat;
1859 set.toPattern(toPat);
1860 if (expToPat == toPat) {
1861 logln((UnicodeString)"Ok: " + pat +
1862 " => " + toPat);
1863 } else {
1864 errln((UnicodeString)"FAIL: " + pat +
1865 " => " + prettify(toPat, TRUE) +
1866 ", exp " + prettify(pat, TRUE));
1867 }
1868 }
1869 }
1870 }
1871
TestContext()1872 void TransliteratorTest::TestContext() {
1873 UTransPosition pos = {0, 2, 0, 1}; // cs cl s l
1874 expect("de > x; {d}e > y;",
1875 "de",
1876 "ye",
1877 &pos);
1878
1879 expect("ab{c} > z;",
1880 "xadabdabcy",
1881 "xadabdabzy");
1882 }
1883
TestSupplemental()1884 void TransliteratorTest::TestSupplemental() {
1885
1886 expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];"
1887 "a > $a; $s > i;"),
1888 CharsToUnicodeString("ab\\U0001030Fx"),
1889 CharsToUnicodeString("\\U00010300bix"));
1890
1891 expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];"
1892 "$b=[A-Z\\U00010400-\\U0001044D];"
1893 "($a)($b) > $2 $1;"),
1894 CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
1895 CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
1896
1897 // k|ax\\U00010300xm
1898
1899 // k|a\\U00010400\\U00010300xm
1900 // ky|\\U00010400\\U00010300xm
1901 // ky\\U00010400|\\U00010300xm
1902
1903 // ky\\U00010400|\\U00010300\\U00010400m
1904 // ky\\U00010400y|\\U00010400m
1905 expect(CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];"
1906 "$a {x} > | @ \\U00010400;"
1907 "{$a} [^\\u0000-\\uFFFF] > y;"),
1908 CharsToUnicodeString("kax\\U00010300xm"),
1909 CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
1910
1911 expectT("Any-Name",
1912 CharsToUnicodeString("\\U00010330\\U000E0061\\u00A0"),
1913 UNICODE_STRING_SIMPLE("\\N{GOTHIC LETTER AHSA}\\N{TAG LATIN SMALL LETTER A}\\N{NO-BREAK SPACE}"));
1914
1915 expectT("Any-Hex/Unicode",
1916 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1917 UNICODE_STRING_SIMPLE("U+10330U+10FF00U+E0061U+00A0"));
1918
1919 expectT("Any-Hex/C",
1920 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1921 UNICODE_STRING_SIMPLE("\\U00010330\\U0010FF00\\U000E0061\\u00A0"));
1922
1923 expectT("Any-Hex/Perl",
1924 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1925 UNICODE_STRING_SIMPLE("\\x{10330}\\x{10FF00}\\x{E0061}\\x{A0}"));
1926
1927 expectT("Any-Hex/Java",
1928 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1929 UNICODE_STRING_SIMPLE("\\uD800\\uDF30\\uDBFF\\uDF00\\uDB40\\uDC61\\u00A0"));
1930
1931 expectT("Any-Hex/XML",
1932 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1933 "𐌰􏼀󠁡 ");
1934
1935 expectT("Any-Hex/XML10",
1936 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1937 "𐌰􏼀󠁡 ");
1938
1939 expectT(UNICODE_STRING_SIMPLE("[\\U000E0000-\\U000E0FFF] Remove"),
1940 CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1941 CharsToUnicodeString("\\U00010330\\U0010FF00\\u00A0"));
1942 }
1943
TestQuantifier()1944 void TransliteratorTest::TestQuantifier() {
1945
1946 // Make sure @ in a quantified anteContext works
1947 expect("a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
1948 "AAAAAb",
1949 "aaa(aac)");
1950
1951 // Make sure @ in a quantified postContext works
1952 expect("{b} a+ > c @@ |; (a+) > '(' $1 ')';",
1953 "baaaaa",
1954 "caa(aaa)");
1955
1956 // Make sure @ in a quantified postContext with seg ref works
1957 expect("{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
1958 "baaaaa",
1959 "baa(aaa)");
1960
1961 // Make sure @ past ante context doesn't enter ante context
1962 UTransPosition pos = {0, 5, 3, 5};
1963 expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
1964 "xxxab",
1965 "xxx(ac)",
1966 &pos);
1967
1968 // Make sure @ past post context doesn't pass limit
1969 UTransPosition pos2 = {0, 4, 0, 2};
1970 expect("{b} a+ > c @@ |; x > y; a > A;",
1971 "baxx",
1972 "caxx",
1973 &pos2);
1974
1975 // Make sure @ past post context doesn't enter post context
1976 expect("{b} a+ > c @@ |; x > y; a > A;",
1977 "baxx",
1978 "cayy");
1979
1980 expect("(ab)? c > d;",
1981 "c abc ababc",
1982 "d d abd");
1983
1984 // NOTE: The (ab)+ when referenced just yields a single "ab",
1985 // not the full sequence of them. This accords with perl behavior.
1986 expect("(ab)+ {x} > '(' $1 ')';",
1987 "x abx ababxy",
1988 "x ab(ab) abab(ab)y");
1989
1990 expect("b+ > x;",
1991 "ac abc abbc abbbc",
1992 "ac axc axc axc");
1993
1994 expect("[abc]+ > x;",
1995 "qac abrc abbcs abtbbc",
1996 "qx xrx xs xtx");
1997
1998 expect("q{(ab)+} > x;",
1999 "qa qab qaba qababc qaba",
2000 "qa qx qxa qxc qxa");
2001
2002 expect("q(ab)* > x;",
2003 "qa qab qaba qababc",
2004 "xa x xa xc");
2005
2006 // NOTE: The (ab)+ when referenced just yields a single "ab",
2007 // not the full sequence of them. This accords with perl behavior.
2008 expect("q(ab)* > '(' $1 ')';",
2009 "qa qab qaba qababc",
2010 "()a (ab) (ab)a (ab)c");
2011
2012 // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
2013 // quoted string
2014 expect("'ab'+ > x;",
2015 "bb ab ababb",
2016 "bb x xb");
2017
2018 // $foo+ and $foo* -- the quantifier should apply to the entire
2019 // variable reference
2020 expect("$var = ab; $var+ > x;",
2021 "bb ab ababb",
2022 "bb x xb");
2023 }
2024
2025 class TestTrans : public Transliterator {
2026 public:
TestTrans(const UnicodeString & id)2027 TestTrans(const UnicodeString& id) : Transliterator(id, 0) {
2028 }
clone(void) const2029 virtual TestTrans* clone(void) const {
2030 return new TestTrans(getID());
2031 }
handleTransliterate(Replaceable &,UTransPosition & offsets,UBool) const2032 virtual void handleTransliterate(Replaceable& /*text*/, UTransPosition& offsets,
2033 UBool /*isIncremental*/) const
2034 {
2035 offsets.start = offsets.limit;
2036 }
2037 virtual UClassID getDynamicClassID() const;
2038 static UClassID U_EXPORT2 getStaticClassID();
2039 };
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)2040 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)
2041
2042 /**
2043 * Test Source-Target/Variant.
2044 */
2045 void TransliteratorTest::TestSTV(void) {
2046 int32_t ns = Transliterator::countAvailableSources();
2047 if (ns < 0 || ns > 255) {
2048 errln((UnicodeString)"FAIL: Bad source count: " + ns);
2049 return;
2050 }
2051 int32_t i, j;
2052 for (i=0; i<ns; ++i) {
2053 UnicodeString source;
2054 Transliterator::getAvailableSource(i, source);
2055 logln((UnicodeString)"" + i + ": " + source);
2056 if (source.length() == 0) {
2057 errln("FAIL: empty source");
2058 continue;
2059 }
2060 int32_t nt = Transliterator::countAvailableTargets(source);
2061 if (nt < 0 || nt > 255) {
2062 errln((UnicodeString)"FAIL: Bad target count: " + nt);
2063 continue;
2064 }
2065 for (int32_t j=0; j<nt; ++j) {
2066 UnicodeString target;
2067 Transliterator::getAvailableTarget(j, source, target);
2068 logln((UnicodeString)" " + j + ": " + target);
2069 if (target.length() == 0) {
2070 errln("FAIL: empty target");
2071 continue;
2072 }
2073 int32_t nv = Transliterator::countAvailableVariants(source, target);
2074 if (nv < 0 || nv > 255) {
2075 errln((UnicodeString)"FAIL: Bad variant count: " + nv);
2076 continue;
2077 }
2078 for (int32_t k=0; k<nv; ++k) {
2079 UnicodeString variant;
2080 Transliterator::getAvailableVariant(k, source, target, variant);
2081 if (variant.length() == 0) {
2082 logln((UnicodeString)" " + k + ": <empty>");
2083 } else {
2084 logln((UnicodeString)" " + k + ": " + variant);
2085 }
2086 }
2087 }
2088 }
2089
2090 // Test registration
2091 const char* IDS[] = { "Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2092 const char* FULL_IDS[] = { "Any-Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2093 const char* SOURCES[] = { NULL, "Seoridf", "Oewoir" };
2094 for (i=0; i<3; ++i) {
2095 Transliterator *t = new TestTrans(IDS[i]);
2096 if (t == 0) {
2097 errln("FAIL: out of memory");
2098 return;
2099 }
2100 if (t->getID() != IDS[i]) {
2101 errln((UnicodeString)"FAIL: ID mismatch for " + IDS[i]);
2102 delete t;
2103 return;
2104 }
2105 Transliterator::registerInstance(t);
2106 UErrorCode status = U_ZERO_ERROR;
2107 t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2108 if (t == NULL) {
2109 errln((UnicodeString)"FAIL: Registration/creation failed for ID " +
2110 IDS[i]);
2111 } else {
2112 logln((UnicodeString)"Ok: Registration/creation succeeded for ID " +
2113 IDS[i]);
2114 delete t;
2115 }
2116 Transliterator::unregister(IDS[i]);
2117 t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2118 if (t != NULL) {
2119 errln((UnicodeString)"FAIL: Unregistration failed for ID " +
2120 IDS[i]);
2121 delete t;
2122 }
2123 }
2124
2125 // Make sure getAvailable API reflects removal
2126 int32_t n = Transliterator::countAvailableIDs();
2127 for (i=0; i<n; ++i) {
2128 UnicodeString id = Transliterator::getAvailableID(i);
2129 for (j=0; j<3; ++j) {
2130 if (id.caseCompare(FULL_IDS[j],0)==0) {
2131 errln((UnicodeString)"FAIL: unregister(" + id + ") failed");
2132 }
2133 }
2134 }
2135 n = Transliterator::countAvailableTargets("Any");
2136 for (i=0; i<n; ++i) {
2137 UnicodeString t;
2138 Transliterator::getAvailableTarget(i, "Any", t);
2139 if (t.caseCompare(IDS[0],0)==0) {
2140 errln((UnicodeString)"FAIL: unregister(Any-" + t + ") failed");
2141 }
2142 }
2143 n = Transliterator::countAvailableSources();
2144 for (i=0; i<n; ++i) {
2145 UnicodeString s;
2146 Transliterator::getAvailableSource(i, s);
2147 for (j=0; j<3; ++j) {
2148 if (SOURCES[j] == NULL) continue;
2149 if (s.caseCompare(SOURCES[j],0)==0) {
2150 errln((UnicodeString)"FAIL: unregister(" + s + "-*) failed");
2151 }
2152 }
2153 }
2154 }
2155
2156 /**
2157 * Test inverse of Greek-Latin; Title()
2158 */
TestCompoundInverse(void)2159 void TransliteratorTest::TestCompoundInverse(void) {
2160 UParseError parseError;
2161 UErrorCode status = U_ZERO_ERROR;
2162 Transliterator *t = Transliterator::createInstance
2163 ("Greek-Latin; Title()", UTRANS_REVERSE,parseError, status);
2164 if (t == 0) {
2165 dataerrln("FAIL: createInstance - %s", u_errorName(status));
2166 return;
2167 }
2168 UnicodeString exp("(Title);Latin-Greek");
2169 if (t->getID() == exp) {
2170 logln("Ok: inverse of \"Greek-Latin; Title()\" is \"" +
2171 t->getID());
2172 } else {
2173 errln("FAIL: inverse of \"Greek-Latin; Title()\" is \"" +
2174 t->getID() + "\", expected \"" + exp + "\"");
2175 }
2176 delete t;
2177 }
2178
2179 /**
2180 * Test NFD chaining with RBT
2181 */
TestNFDChainRBT()2182 void TransliteratorTest::TestNFDChainRBT() {
2183 UParseError pe;
2184 UErrorCode ec = U_ZERO_ERROR;
2185 Transliterator* t = Transliterator::createFromRules(
2186 "TEST", "::NFD; aa > Q; a > q;",
2187 UTRANS_FORWARD, pe, ec);
2188 if (t == NULL || U_FAILURE(ec)) {
2189 dataerrln("FAIL: Transliterator::createFromRules failed with %s", u_errorName(ec));
2190 return;
2191 }
2192 expect(*t, "aa", "Q");
2193 delete t;
2194
2195 // TEMPORARY TESTS -- BEING DEBUGGED
2196 //=- UnicodeString s, s2;
2197 //=- t = Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, pe, ec);
2198 //=- s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2199 //=- s2 = CharsToUnicodeString("\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D");
2200 //=- expect(*t, s, s2);
2201 //=- delete t;
2202 //=-
2203 //=- t = Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2204 //=- expect(*t, s2, s);
2205 //=- delete t;
2206 //=-
2207 //=- t = Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2208 //=- s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2209 //=- expect(*t, s, s);
2210 //=- delete t;
2211
2212 // const char* source[] = {
2213 // /*
2214 // "\\u015Br\\u012Bmad",
2215 // "bhagavadg\\u012Bt\\u0101",
2216 // "adhy\\u0101ya",
2217 // "arjuna",
2218 // "vi\\u1E63\\u0101da",
2219 // "y\\u014Dga",
2220 // "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2221 // "uv\\u0101cr\\u0325",
2222 // */
2223 // "rmk\\u1E63\\u0113t",
2224 // //"dharmak\\u1E63\\u0113tr\\u0113",
2225 // /*
2226 // "kuruk\\u1E63\\u0113tr\\u0113",
2227 // "samav\\u0113t\\u0101",
2228 // "yuyutsava-\\u1E25",
2229 // "m\\u0101mak\\u0101-\\u1E25",
2230 // // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2231 // "kimakurvata",
2232 // "san\\u0304java",
2233 // */
2234 //
2235 // 0
2236 // };
2237 // const char* expected[] = {
2238 // /*
2239 // "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2240 // "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2241 // "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2242 // "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2243 // "\\u0935\\u093f\\u0937\\u093e\\u0926",
2244 // "\\u092f\\u094b\\u0917",
2245 // "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2246 // "\\u0909\\u0935\\u093E\\u091A\\u0943",
2247 // */
2248 // "\\u0927",
2249 // //"\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2250 // /*
2251 // "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2252 // "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2253 // "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2254 // "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2255 // // "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2256 // "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2257 // "\\u0938\\u0902\\u091c\\u0935",
2258 // */
2259 // 0
2260 // };
2261 // UErrorCode status = U_ZERO_ERROR;
2262 // UParseError parseError;
2263 // UnicodeString message;
2264 // Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2265 // Transliterator* devToLatinToDev=Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2266 // if(U_FAILURE(status)){
2267 // errln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2268 // errln("PreContext: " + prettify(parseError.preContext) + "PostContext: " + prettify( parseError.postContext) );
2269 // delete latinToDevToLatin;
2270 // delete devToLatinToDev;
2271 // return;
2272 // }
2273 // UnicodeString gotResult;
2274 // for(int i= 0; source[i] != 0; i++){
2275 // gotResult = source[i];
2276 // expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2277 // expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2278 // }
2279 // delete latinToDevToLatin;
2280 // delete devToLatinToDev;
2281 }
2282
2283 /**
2284 * Inverse of "Null" should be "Null". (J21)
2285 */
TestNullInverse()2286 void TransliteratorTest::TestNullInverse() {
2287 UParseError pe;
2288 UErrorCode ec = U_ZERO_ERROR;
2289 Transliterator *t = Transliterator::createInstance("Null", UTRANS_FORWARD, pe, ec);
2290 if (t == 0 || U_FAILURE(ec)) {
2291 errln("FAIL: createInstance");
2292 return;
2293 }
2294 Transliterator *u = t->createInverse(ec);
2295 if (u == 0 || U_FAILURE(ec)) {
2296 errln("FAIL: createInverse");
2297 delete t;
2298 return;
2299 }
2300 if (u->getID() != "Null") {
2301 errln("FAIL: Inverse of Null should be Null");
2302 }
2303 delete t;
2304 delete u;
2305 }
2306
2307 /**
2308 * Check ID of inverse of alias. (J22)
2309 */
TestAliasInverseID()2310 void TransliteratorTest::TestAliasInverseID() {
2311 UnicodeString ID("Latin-Hangul", ""); // This should be any alias ID with an inverse
2312 UParseError pe;
2313 UErrorCode ec = U_ZERO_ERROR;
2314 Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2315 if (t == 0 || U_FAILURE(ec)) {
2316 dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2317 return;
2318 }
2319 Transliterator *u = t->createInverse(ec);
2320 if (u == 0 || U_FAILURE(ec)) {
2321 errln("FAIL: createInverse");
2322 delete t;
2323 return;
2324 }
2325 UnicodeString exp = "Hangul-Latin";
2326 UnicodeString got = u->getID();
2327 if (got != exp) {
2328 errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2329 ", expected " + exp);
2330 }
2331 delete t;
2332 delete u;
2333 }
2334
2335 /**
2336 * Test IDs of inverses of compound transliterators. (J20)
2337 */
TestCompoundInverseID()2338 void TransliteratorTest::TestCompoundInverseID() {
2339 UnicodeString ID = "Latin-Jamo;NFC(NFD)";
2340 UParseError pe;
2341 UErrorCode ec = U_ZERO_ERROR;
2342 Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2343 if (t == 0 || U_FAILURE(ec)) {
2344 dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2345 return;
2346 }
2347 Transliterator *u = t->createInverse(ec);
2348 if (u == 0 || U_FAILURE(ec)) {
2349 errln("FAIL: createInverse");
2350 delete t;
2351 return;
2352 }
2353 UnicodeString exp = "NFD(NFC);Jamo-Latin";
2354 UnicodeString got = u->getID();
2355 if (got != exp) {
2356 errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2357 ", expected " + exp);
2358 }
2359 delete t;
2360 delete u;
2361 }
2362
2363 /**
2364 * Test undefined variable.
2365
2366 */
TestUndefinedVariable()2367 void TransliteratorTest::TestUndefinedVariable() {
2368 UnicodeString rule = "$initial } a <> \\u1161;";
2369 UParseError pe;
2370 UErrorCode ec = U_ZERO_ERROR;
2371 Transliterator *t = Transliterator::createFromRules("<ID>", rule, UTRANS_FORWARD, pe, ec);
2372 delete t;
2373 if (U_FAILURE(ec)) {
2374 logln((UnicodeString)"OK: Got exception for " + rule + ", as expected: " +
2375 u_errorName(ec));
2376 return;
2377 }
2378 errln((UnicodeString)"Fail: bogus rule " + rule + " compiled with error " +
2379 u_errorName(ec));
2380 }
2381
2382 /**
2383 * Test empty context.
2384 */
TestEmptyContext()2385 void TransliteratorTest::TestEmptyContext() {
2386 expect(" { a } > b;", "xay a ", "xby b ");
2387 }
2388
2389 /**
2390 * Test compound filter ID syntax
2391 */
TestCompoundFilterID(void)2392 void TransliteratorTest::TestCompoundFilterID(void) {
2393 static const char* DATA[] = {
2394 // Col. 1 = ID or rule set (latter must start with #)
2395
2396 // = columns > 1 are null if expect col. 1 to be illegal =
2397
2398 // Col. 2 = direction, "F..." or "R..."
2399 // Col. 3 = source string
2400 // Col. 4 = exp result
2401
2402 "[abc]; [abc]", NULL, NULL, NULL, // multiple filters
2403 "Latin-Greek; [abc];", NULL, NULL, NULL, // misplaced filter
2404 "[b]; Latin-Greek; Upper; ([xyz])", "F", "abc", "a\\u0392c",
2405 "[b]; (Lower); Latin-Greek; Upper(); ([\\u0392])", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2406 "#\n::[b]; ::Latin-Greek; ::Upper; ::([xyz]);", "F", "abc", "a\\u0392c",
2407 "#\n::[b]; ::(Lower); ::Latin-Greek; ::Upper(); ::([\\u0392]);", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2408 NULL,
2409 };
2410
2411 for (int32_t i=0; DATA[i]; i+=4) {
2412 UnicodeString id = CharsToUnicodeString(DATA[i]);
2413 UTransDirection direction = (DATA[i+1] != NULL && DATA[i+1][0] == 'R') ?
2414 UTRANS_REVERSE : UTRANS_FORWARD;
2415 UnicodeString source;
2416 UnicodeString exp;
2417 if (DATA[i+2] != NULL) {
2418 source = CharsToUnicodeString(DATA[i+2]);
2419 exp = CharsToUnicodeString(DATA[i+3]);
2420 }
2421 UBool expOk = (DATA[i+1] != NULL);
2422 LocalPointer<Transliterator> t;
2423 UParseError pe;
2424 UErrorCode ec = U_ZERO_ERROR;
2425 if (id.charAt(0) == 0x23/*#*/) {
2426 t.adoptInstead(Transliterator::createFromRules("ID", id, direction, pe, ec));
2427 } else {
2428 t.adoptInstead(Transliterator::createInstance(id, direction, pe, ec));
2429 }
2430 UBool ok = (t.isValid() && U_SUCCESS(ec));
2431 UnicodeString transID;
2432 if (t.isValid()) {
2433 transID = t->getID();
2434 }
2435 else {
2436 transID = UnicodeString("NULL", "");
2437 }
2438 if (ok == expOk) {
2439 logln((UnicodeString)"Ok: " + id + " => " + transID + ", " +
2440 u_errorName(ec));
2441 if (source.length() != 0) {
2442 expect(*t, source, exp);
2443 }
2444 } else {
2445 dataerrln((UnicodeString)"FAIL: " + id + " => " + transID + ", " +
2446 u_errorName(ec));
2447 }
2448 }
2449 }
2450
2451 /**
2452 * Test new property set syntax
2453 */
TestPropertySet()2454 void TransliteratorTest::TestPropertySet() {
2455 expect(UNICODE_STRING_SIMPLE("a>A; \\p{Lu}>x; \\p{ANY}>y;"), "abcDEF", "Ayyxxx");
2456 expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
2457 "[ a stitch ]\n[ in time ]\r[ saves 9]");
2458 }
2459
2460 /**
2461 * Test various failure points of the new 2.0 engine.
2462 */
TestNewEngine()2463 void TransliteratorTest::TestNewEngine() {
2464 UParseError pe;
2465 UErrorCode ec = U_ZERO_ERROR;
2466 Transliterator *t = Transliterator::createInstance("Latin-Hiragana", UTRANS_FORWARD, pe, ec);
2467 if (t == 0 || U_FAILURE(ec)) {
2468 dataerrln("FAIL: createInstance Latin-Hiragana - %s", u_errorName(ec));
2469 return;
2470 }
2471 // Katakana should be untouched
2472 expect(*t, CharsToUnicodeString("a\\u3042\\u30A2"),
2473 CharsToUnicodeString("\\u3042\\u3042\\u30A2"));
2474
2475 delete t;
2476
2477 #if 1
2478 // This test will only work if Transliterator.ROLLBACK is
2479 // true. Otherwise, this test will fail, revealing a
2480 // limitation of global filters in incremental mode.
2481 Transliterator *a =
2482 Transliterator::createFromRules("a_to_A", "a > A;", UTRANS_FORWARD, pe, ec);
2483 Transliterator *A =
2484 Transliterator::createFromRules("A_to_b", "A > b;", UTRANS_FORWARD, pe, ec);
2485 if (U_FAILURE(ec)) {
2486 delete a;
2487 delete A;
2488 return;
2489 }
2490
2491 Transliterator* array[3];
2492 array[0] = a;
2493 array[1] = Transliterator::createInstance("NFD", UTRANS_FORWARD, pe, ec);
2494 array[2] = A;
2495 if (U_FAILURE(ec)) {
2496 errln("FAIL: createInstance NFD");
2497 delete a;
2498 delete A;
2499 delete array[1];
2500 return;
2501 }
2502
2503 t = new CompoundTransliterator(array, 3, new UnicodeSet("[:Ll:]", ec));
2504 if (U_FAILURE(ec)) {
2505 errln("FAIL: UnicodeSet constructor");
2506 delete a;
2507 delete A;
2508 delete array[1];
2509 delete t;
2510 return;
2511 }
2512
2513 expect(*t, "aAaA", "bAbA");
2514
2515 assertTrue("countElements", t->countElements() == 3);
2516 assertEquals("getElement(0)", t->getElement(0, ec).getID(), "a_to_A");
2517 assertEquals("getElement(1)", t->getElement(1, ec).getID(), "NFD");
2518 assertEquals("getElement(2)", t->getElement(2, ec).getID(), "A_to_b");
2519 assertSuccess("getElement", ec);
2520
2521 delete a;
2522 delete A;
2523 delete array[1];
2524 delete t;
2525 #endif
2526
2527 expect("$smooth = x; $macron = q; [:^L:] { ([aeiouyAEIOUY] $macron?) } [^aeiouyAEIOUY$smooth$macron] > | $1 $smooth ;",
2528 "a",
2529 "ax");
2530
2531 UnicodeString gr = CharsToUnicodeString(
2532 "$ddot = \\u0308 ;"
2533 "$lcgvowel = [\\u03b1\\u03b5\\u03b7\\u03b9\\u03bf\\u03c5\\u03c9] ;"
2534 "$rough = \\u0314 ;"
2535 "($lcgvowel+ $ddot?) $rough > h | $1 ;"
2536 "\\u03b1 <> a ;"
2537 "$rough <> h ;");
2538
2539 expect(gr, CharsToUnicodeString("\\u03B1\\u0314"), "ha");
2540 }
2541
2542 /**
2543 * Test quantified segment behavior. We want:
2544 * ([abc])+ > x $1 x; applied to "cba" produces "xax"
2545 */
TestQuantifiedSegment(void)2546 void TransliteratorTest::TestQuantifiedSegment(void) {
2547 // The normal case
2548 expect("([abc]+) > x $1 x;", "cba", "xcbax");
2549
2550 // The tricky case; the quantifier is around the segment
2551 expect("([abc])+ > x $1 x;", "cba", "xax");
2552
2553 // Tricky case in reverse direction
2554 expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
2555
2556 // Check post-context segment
2557 expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
2558
2559 // Test toRule/toPattern for non-quantified segment.
2560 // Careful with spacing here.
2561 UnicodeString r("([a-c]){q} > x $1 x;");
2562 UParseError pe;
2563 UErrorCode ec = U_ZERO_ERROR;
2564 Transliterator* t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2565 if (U_FAILURE(ec)) {
2566 errln("FAIL: createFromRules");
2567 delete t;
2568 return;
2569 }
2570 UnicodeString rr;
2571 t->toRules(rr, TRUE);
2572 if (r != rr) {
2573 errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2574 } else {
2575 logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2576 }
2577 delete t;
2578
2579 // Test toRule/toPattern for quantified segment.
2580 // Careful with spacing here.
2581 r = "([a-c])+{q} > x $1 x;";
2582 t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2583 if (U_FAILURE(ec)) {
2584 errln("FAIL: createFromRules");
2585 delete t;
2586 return;
2587 }
2588 t->toRules(rr, TRUE);
2589 if (r != rr) {
2590 errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2591 } else {
2592 logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2593 }
2594 delete t;
2595 }
2596
2597 //======================================================================
2598 // Ram's tests
2599 //======================================================================
TestDevanagariLatinRT()2600 void TransliteratorTest::TestDevanagariLatinRT(){
2601 const int MAX_LEN= 52;
2602 const char* const source[MAX_LEN] = {
2603 "bh\\u0101rata",
2604 "kra",
2605 "k\\u1E63a",
2606 "khra",
2607 "gra",
2608 "\\u1E45ra",
2609 "cra",
2610 "chra",
2611 "j\\u00F1a",
2612 "jhra",
2613 "\\u00F1ra",
2614 "\\u1E6Dya",
2615 "\\u1E6Dhra",
2616 "\\u1E0Dya",
2617 //"r\\u0323ya", // \u095c is not valid in Devanagari
2618 "\\u1E0Dhya",
2619 "\\u1E5Bhra",
2620 "\\u1E47ra",
2621 "tta",
2622 "thra",
2623 "dda",
2624 "dhra",
2625 "nna",
2626 "pra",
2627 "phra",
2628 "bra",
2629 "bhra",
2630 "mra",
2631 "\\u1E49ra",
2632 //"l\\u0331ra",
2633 "yra",
2634 "\\u1E8Fra",
2635 //"l-",
2636 "vra",
2637 "\\u015Bra",
2638 "\\u1E63ra",
2639 "sra",
2640 "hma",
2641 "\\u1E6D\\u1E6Da",
2642 "\\u1E6D\\u1E6Dha",
2643 "\\u1E6Dh\\u1E6Dha",
2644 "\\u1E0D\\u1E0Da",
2645 "\\u1E0D\\u1E0Dha",
2646 "\\u1E6Dya",
2647 "\\u1E6Dhya",
2648 "\\u1E0Dya",
2649 "\\u1E0Dhya",
2650 // Not roundtrippable --
2651 // \\u0939\\u094d\\u094d\\u092E - hma
2652 // \\u0939\\u094d\\u092E - hma
2653 // CharsToUnicodeString("hma"),
2654 "hya",
2655 "\\u015Br\\u0325",
2656 "\\u015Bca",
2657 "\\u0115",
2658 "san\\u0304j\\u012Bb s\\u0113nagupta",
2659 "\\u0101nand vaddir\\u0101ju",
2660 "\\u0101",
2661 "a"
2662 };
2663 const char* const expected[MAX_LEN] = {
2664 "\\u092D\\u093E\\u0930\\u0924", /* bha\\u0304rata */
2665 "\\u0915\\u094D\\u0930", /* kra */
2666 "\\u0915\\u094D\\u0937", /* ks\\u0323a */
2667 "\\u0916\\u094D\\u0930", /* khra */
2668 "\\u0917\\u094D\\u0930", /* gra */
2669 "\\u0919\\u094D\\u0930", /* n\\u0307ra */
2670 "\\u091A\\u094D\\u0930", /* cra */
2671 "\\u091B\\u094D\\u0930", /* chra */
2672 "\\u091C\\u094D\\u091E", /* jn\\u0303a */
2673 "\\u091D\\u094D\\u0930", /* jhra */
2674 "\\u091E\\u094D\\u0930", /* n\\u0303ra */
2675 "\\u091F\\u094D\\u092F", /* t\\u0323ya */
2676 "\\u0920\\u094D\\u0930", /* t\\u0323hra */
2677 "\\u0921\\u094D\\u092F", /* d\\u0323ya */
2678 //"\\u095C\\u094D\\u092F", /* r\\u0323ya */ // \u095c is not valid in Devanagari
2679 "\\u0922\\u094D\\u092F", /* d\\u0323hya */
2680 "\\u0922\\u093C\\u094D\\u0930", /* r\\u0323hra */
2681 "\\u0923\\u094D\\u0930", /* n\\u0323ra */
2682 "\\u0924\\u094D\\u0924", /* tta */
2683 "\\u0925\\u094D\\u0930", /* thra */
2684 "\\u0926\\u094D\\u0926", /* dda */
2685 "\\u0927\\u094D\\u0930", /* dhra */
2686 "\\u0928\\u094D\\u0928", /* nna */
2687 "\\u092A\\u094D\\u0930", /* pra */
2688 "\\u092B\\u094D\\u0930", /* phra */
2689 "\\u092C\\u094D\\u0930", /* bra */
2690 "\\u092D\\u094D\\u0930", /* bhra */
2691 "\\u092E\\u094D\\u0930", /* mra */
2692 "\\u0929\\u094D\\u0930", /* n\\u0331ra */
2693 //"\\u0934\\u094D\\u0930", /* l\\u0331ra */
2694 "\\u092F\\u094D\\u0930", /* yra */
2695 "\\u092F\\u093C\\u094D\\u0930", /* y\\u0307ra */
2696 //"l-",
2697 "\\u0935\\u094D\\u0930", /* vra */
2698 "\\u0936\\u094D\\u0930", /* s\\u0301ra */
2699 "\\u0937\\u094D\\u0930", /* s\\u0323ra */
2700 "\\u0938\\u094D\\u0930", /* sra */
2701 "\\u0939\\u094d\\u092E", /* hma */
2702 "\\u091F\\u094D\\u091F", /* t\\u0323t\\u0323a */
2703 "\\u091F\\u094D\\u0920", /* t\\u0323t\\u0323ha */
2704 "\\u0920\\u094D\\u0920", /* t\\u0323ht\\u0323ha*/
2705 "\\u0921\\u094D\\u0921", /* d\\u0323d\\u0323a */
2706 "\\u0921\\u094D\\u0922", /* d\\u0323d\\u0323ha */
2707 "\\u091F\\u094D\\u092F", /* t\\u0323ya */
2708 "\\u0920\\u094D\\u092F", /* t\\u0323hya */
2709 "\\u0921\\u094D\\u092F", /* d\\u0323ya */
2710 "\\u0922\\u094D\\u092F", /* d\\u0323hya */
2711 // "hma", /* hma */
2712 "\\u0939\\u094D\\u092F", /* hya */
2713 "\\u0936\\u0943", /* s\\u0301r\\u0325a */
2714 "\\u0936\\u094D\\u091A", /* s\\u0301ca */
2715 "\\u090d", /* e\\u0306 */
2716 "\\u0938\\u0902\\u091C\\u0940\\u092C\\u094D \\u0938\\u0947\\u0928\\u0917\\u0941\\u092A\\u094D\\u0924",
2717 "\\u0906\\u0928\\u0902\\u0926\\u094D \\u0935\\u0926\\u094D\\u0926\\u093F\\u0930\\u093E\\u091C\\u0941",
2718 "\\u0906",
2719 "\\u0905",
2720 };
2721 UErrorCode status = U_ZERO_ERROR;
2722 UParseError parseError;
2723 UnicodeString message;
2724 Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2725 Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2726 if(U_FAILURE(status)){
2727 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2728 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2729 return;
2730 }
2731 UnicodeString gotResult;
2732 for(int i= 0; i<MAX_LEN; i++){
2733 gotResult = source[i];
2734 expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2735 expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2736 }
2737 delete latinToDev;
2738 delete devToLatin;
2739 }
2740
TestTeluguLatinRT()2741 void TransliteratorTest::TestTeluguLatinRT(){
2742 const int MAX_LEN=10;
2743 const char* const source[MAX_LEN] = {
2744 "raghur\\u0101m vi\\u015Bvan\\u0101dha", /* Raghuram Viswanadha */
2745 "\\u0101nand vaddir\\u0101ju", /* Anand Vaddiraju */
2746 "r\\u0101j\\u012Bv ka\\u015Barab\\u0101da", /* Rajeev Kasarabada */
2747 "san\\u0304j\\u012Bv ka\\u015Barab\\u0101da", /* sanjeev kasarabada */
2748 "san\\u0304j\\u012Bb sen'gupta", /* sanjib sengupata */
2749 "amar\\u0113ndra hanum\\u0101nula", /* Amarendra hanumanula */
2750 "ravi kum\\u0101r vi\\u015Bvan\\u0101dha", /* Ravi Kumar Viswanadha */
2751 "\\u0101ditya kandr\\u0113gula", /* Aditya Kandregula */
2752 "\\u015Br\\u012Bdhar ka\\u1E47\\u1E6Dama\\u015Be\\u1E6D\\u1E6Di",/* Shridhar Kantamsetty */
2753 "m\\u0101dhav de\\u015Be\\u1E6D\\u1E6Di" /* Madhav Desetty */
2754 };
2755
2756 const char* const expected[MAX_LEN] = {
2757 "\\u0c30\\u0c18\\u0c41\\u0c30\\u0c3e\\u0c2e\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2758 "\\u0c06\\u0c28\\u0c02\\u0c26\\u0c4d \\u0C35\\u0C26\\u0C4D\\u0C26\\u0C3F\\u0C30\\u0C3E\\u0C1C\\u0C41",
2759 "\\u0c30\\u0c3e\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2760 "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2761 "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c2c\\u0c4d \\u0c38\\u0c46\\u0c28\\u0c4d\\u0c17\\u0c41\\u0c2a\\u0c4d\\u0c24",
2762 "\\u0c05\\u0c2e\\u0c30\\u0c47\\u0c02\\u0c26\\u0c4d\\u0c30 \\u0c39\\u0c28\\u0c41\\u0c2e\\u0c3e\\u0c28\\u0c41\\u0c32",
2763 "\\u0c30\\u0c35\\u0c3f \\u0c15\\u0c41\\u0c2e\\u0c3e\\u0c30\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2764 "\\u0c06\\u0c26\\u0c3f\\u0c24\\u0c4d\\u0c2f \\u0C15\\u0C02\\u0C26\\u0C4D\\u0C30\\u0C47\\u0C17\\u0C41\\u0c32",
2765 "\\u0c36\\u0c4d\\u0c30\\u0c40\\u0C27\\u0C30\\u0C4D \\u0c15\\u0c02\\u0c1f\\u0c2e\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2766 "\\u0c2e\\u0c3e\\u0c27\\u0c35\\u0c4d \\u0c26\\u0c46\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2767 };
2768
2769 UErrorCode status = U_ZERO_ERROR;
2770 UParseError parseError;
2771 UnicodeString message;
2772 Transliterator* latinToDev=Transliterator::createInstance("Latin-Telugu", UTRANS_FORWARD, parseError, status);
2773 Transliterator* devToLatin=Transliterator::createInstance("Telugu-Latin", UTRANS_FORWARD, parseError, status);
2774 if(U_FAILURE(status)){
2775 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2776 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2777 return;
2778 }
2779 UnicodeString gotResult;
2780 for(int i= 0; i<MAX_LEN; i++){
2781 gotResult = source[i];
2782 expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2783 expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2784 }
2785 delete latinToDev;
2786 delete devToLatin;
2787 }
2788
TestSanskritLatinRT()2789 void TransliteratorTest::TestSanskritLatinRT(){
2790 const int MAX_LEN =16;
2791 const char* const source[MAX_LEN] = {
2792 "rmk\\u1E63\\u0113t",
2793 "\\u015Br\\u012Bmad",
2794 "bhagavadg\\u012Bt\\u0101",
2795 "adhy\\u0101ya",
2796 "arjuna",
2797 "vi\\u1E63\\u0101da",
2798 "y\\u014Dga",
2799 "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2800 "uv\\u0101cr\\u0325",
2801 "dharmak\\u1E63\\u0113tr\\u0113",
2802 "kuruk\\u1E63\\u0113tr\\u0113",
2803 "samav\\u0113t\\u0101",
2804 "yuyutsava\\u1E25",
2805 "m\\u0101mak\\u0101\\u1E25",
2806 // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2807 "kimakurvata",
2808 "san\\u0304java",
2809 };
2810 const char* const expected[MAX_LEN] = {
2811 "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2812 "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2813 "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2814 "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2815 "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2816 "\\u0935\\u093f\\u0937\\u093e\\u0926",
2817 "\\u092f\\u094b\\u0917",
2818 "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2819 "\\u0909\\u0935\\u093E\\u091A\\u0943",
2820 "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2821 "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2822 "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2823 "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2824 "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2825 //"\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2826 "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2827 "\\u0938\\u0902\\u091c\\u0935",
2828 };
2829 UErrorCode status = U_ZERO_ERROR;
2830 UParseError parseError;
2831 UnicodeString message;
2832 Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2833 Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2834 if(U_FAILURE(status)){
2835 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2836 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2837 return;
2838 }
2839 UnicodeString gotResult;
2840 for(int i= 0; i<MAX_LEN; i++){
2841 gotResult = source[i];
2842 expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2843 expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2844 }
2845 delete latinToDev;
2846 delete devToLatin;
2847 }
2848
2849
TestCompoundLatinRT()2850 void TransliteratorTest::TestCompoundLatinRT(){
2851 const char* const source[] = {
2852 "rmk\\u1E63\\u0113t",
2853 "\\u015Br\\u012Bmad",
2854 "bhagavadg\\u012Bt\\u0101",
2855 "adhy\\u0101ya",
2856 "arjuna",
2857 "vi\\u1E63\\u0101da",
2858 "y\\u014Dga",
2859 "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2860 "uv\\u0101cr\\u0325",
2861 "dharmak\\u1E63\\u0113tr\\u0113",
2862 "kuruk\\u1E63\\u0113tr\\u0113",
2863 "samav\\u0113t\\u0101",
2864 "yuyutsava\\u1E25",
2865 "m\\u0101mak\\u0101\\u1E25",
2866 // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2867 "kimakurvata",
2868 "san\\u0304java"
2869 };
2870 const int MAX_LEN = UPRV_LENGTHOF(source);
2871 const char* const expected[MAX_LEN] = {
2872 "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2873 "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2874 "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2875 "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2876 "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2877 "\\u0935\\u093f\\u0937\\u093e\\u0926",
2878 "\\u092f\\u094b\\u0917",
2879 "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2880 "\\u0909\\u0935\\u093E\\u091A\\u0943",
2881 "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2882 "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2883 "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2884 "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2885 "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2886 // "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2887 "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2888 "\\u0938\\u0902\\u091c\\u0935"
2889 };
2890 if(MAX_LEN != UPRV_LENGTHOF(expected)) {
2891 errln("error in TestCompoundLatinRT: source[] and expected[] have different lengths!");
2892 return;
2893 }
2894
2895 UErrorCode status = U_ZERO_ERROR;
2896 UParseError parseError;
2897 UnicodeString message;
2898 Transliterator* devToLatinToDev =Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2899 Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2900 Transliterator* devToTelToDev =Transliterator::createInstance("Devanagari-Telugu;Telugu-Devanagari", UTRANS_FORWARD, parseError, status);
2901 Transliterator* latinToTelToLatin=Transliterator::createInstance("Latin-Telugu;Telugu-Latin", UTRANS_FORWARD, parseError, status);
2902
2903 if(U_FAILURE(status)){
2904 dataerrln("FAIL: construction " + UnicodeString(" Error: ") + u_errorName(status));
2905 dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2906 return;
2907 }
2908 UnicodeString gotResult;
2909 for(int i= 0; i<MAX_LEN; i++){
2910 gotResult = source[i];
2911 expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2912 expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2913 expect(*latinToTelToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2914
2915 }
2916 delete(latinToDevToLatin);
2917 delete(devToLatinToDev);
2918 delete(devToTelToDev);
2919 delete(latinToTelToLatin);
2920 }
2921
2922 /**
2923 * Test Gurmukhi-Devanagari Tippi and Bindi
2924 */
TestGurmukhiDevanagari()2925 void TransliteratorTest::TestGurmukhiDevanagari(){
2926 // the rule says:
2927 // (\u0902) (when preceded by vowel) ---> (\u0A02)
2928 // (\u0902) (when preceded by consonant) ---> (\u0A70)
2929 UErrorCode status = U_ZERO_ERROR;
2930 UnicodeSet vowel(UnicodeString("[\\u0905-\\u090A \\u090F\\u0910\\u0913\\u0914 \\u093e-\\u0942\\u0947\\u0948\\u094B\\u094C\\u094D]", -1, US_INV).unescape(), status);
2931 UnicodeSet non_vowel(UnicodeString("[\\u0915-\\u0928\\u092A-\\u0930]", -1, US_INV).unescape(), status);
2932 UParseError parseError;
2933
2934 UnicodeSetIterator vIter(vowel);
2935 UnicodeSetIterator nvIter(non_vowel);
2936 Transliterator* trans = Transliterator::createInstance("Devanagari-Gurmukhi",UTRANS_FORWARD, parseError, status);
2937 if(U_FAILURE(status)) {
2938 dataerrln("Error creating transliterator %s", u_errorName(status));
2939 delete trans;
2940 return;
2941 }
2942 UnicodeString src (" \\u0902", -1, US_INV);
2943 UnicodeString expected(" \\u0A02", -1, US_INV);
2944 src = src.unescape();
2945 expected= expected.unescape();
2946
2947 while(vIter.next()){
2948 src.setCharAt(0,(UChar) vIter.getCodepoint());
2949 expected.setCharAt(0,(UChar) (vIter.getCodepoint()+0x0100));
2950 expect(*trans,src,expected);
2951 }
2952
2953 expected.setCharAt(1,0x0A70);
2954 while(nvIter.next()){
2955 //src.setCharAt(0,(char) nvIter.codepoint);
2956 src.setCharAt(0,(UChar)nvIter.getCodepoint());
2957 expected.setCharAt(0,(UChar) (nvIter.getCodepoint()+0x0100));
2958 expect(*trans,src,expected);
2959 }
2960 delete trans;
2961 }
2962 /**
2963 * Test instantiation from a locale.
2964 */
TestLocaleInstantiation(void)2965 void TransliteratorTest::TestLocaleInstantiation(void) {
2966 UParseError pe;
2967 UErrorCode ec = U_ZERO_ERROR;
2968 Transliterator *t = Transliterator::createInstance("ru_RU-Latin", UTRANS_FORWARD, pe, ec);
2969 if (U_FAILURE(ec)) {
2970 dataerrln("FAIL: createInstance(ru_RU-Latin) - %s", u_errorName(ec));
2971 delete t;
2972 return;
2973 }
2974 expect(*t, CharsToUnicodeString("\\u0430"), "a");
2975 delete t;
2976
2977 t = Transliterator::createInstance("en-el", UTRANS_FORWARD, pe, ec);
2978 if (U_FAILURE(ec)) {
2979 errln("FAIL: createInstance(en-el)");
2980 delete t;
2981 return;
2982 }
2983 expect(*t, "a", CharsToUnicodeString("\\u03B1"));
2984 delete t;
2985 }
2986
2987 /**
2988 * Test title case handling of accent (should ignore accents)
2989 */
TestTitleAccents(void)2990 void TransliteratorTest::TestTitleAccents(void) {
2991 UParseError pe;
2992 UErrorCode ec = U_ZERO_ERROR;
2993 Transliterator *t = Transliterator::createInstance("Title", UTRANS_FORWARD, pe, ec);
2994 if (U_FAILURE(ec)) {
2995 errln("FAIL: createInstance(Title)");
2996 delete t;
2997 return;
2998 }
2999 expect(*t, CharsToUnicodeString("a\\u0300b can't abe"), CharsToUnicodeString("A\\u0300b Can't Abe"));
3000 delete t;
3001 }
3002
3003 /**
3004 * Basic test of a locale resource based rule.
3005 */
TestLocaleResource()3006 void TransliteratorTest::TestLocaleResource() {
3007 const char* DATA[] = {
3008 // id from to
3009 //"Latin-Greek/UNGEGN", "b", "\\u03bc\\u03c0",
3010 "Latin-el", "b", "\\u03bc\\u03c0",
3011 "Latin-Greek", "b", "\\u03B2",
3012 "Greek-Latin/UNGEGN", "\\u03B2", "v",
3013 "el-Latin", "\\u03B2", "v",
3014 "Greek-Latin", "\\u03B2", "b",
3015 };
3016 const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3017 for (int32_t i=0; i<DATA_length; i+=3) {
3018 UParseError pe;
3019 UErrorCode ec = U_ZERO_ERROR;
3020 Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, pe, ec);
3021 if (U_FAILURE(ec)) {
3022 dataerrln((UnicodeString)"FAIL: createInstance(" + DATA[i] + ") - " + u_errorName(ec));
3023 delete t;
3024 continue;
3025 }
3026 expect(*t, CharsToUnicodeString(DATA[i+1]),
3027 CharsToUnicodeString(DATA[i+2]));
3028 delete t;
3029 }
3030 }
3031
3032 /**
3033 * Make sure parse errors reference the right line.
3034 */
TestParseError()3035 void TransliteratorTest::TestParseError() {
3036 static const char* rule =
3037 "a > b;\n"
3038 "# more stuff\n"
3039 "d << b;";
3040 UErrorCode ec = U_ZERO_ERROR;
3041 UParseError pe;
3042 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3043 delete t;
3044 if (U_FAILURE(ec)) {
3045 UnicodeString err(pe.preContext);
3046 err.append((UChar)124/*|*/).append(pe.postContext);
3047 if (err.indexOf("d << b") >= 0) {
3048 logln("Ok: " + err);
3049 } else {
3050 errln("FAIL: " + err);
3051 }
3052 }
3053 else {
3054 errln("FAIL: no syntax error");
3055 }
3056 static const char* maskingRule =
3057 "a>x;\n"
3058 "# more stuff\n"
3059 "ab>y;";
3060 ec = U_ZERO_ERROR;
3061 delete Transliterator::createFromRules("ID", maskingRule, UTRANS_FORWARD, pe, ec);
3062 if (ec != U_RULE_MASK_ERROR) {
3063 errln("FAIL: returned %s instead of U_RULE_MASK_ERROR", u_errorName(ec));
3064 }
3065 else if (UnicodeString("a > x;") != UnicodeString(pe.preContext)) {
3066 errln("FAIL: did not get expected precontext");
3067 }
3068 else if (UnicodeString("ab > y;") != UnicodeString(pe.postContext)) {
3069 errln("FAIL: did not get expected postcontext");
3070 }
3071 }
3072
3073 /**
3074 * Make sure sets on output are disallowed.
3075 */
TestOutputSet()3076 void TransliteratorTest::TestOutputSet() {
3077 UnicodeString rule = "$set = [a-cm-n]; b > $set;";
3078 UErrorCode ec = U_ZERO_ERROR;
3079 UParseError pe;
3080 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3081 delete t;
3082 if (U_FAILURE(ec)) {
3083 UnicodeString err(pe.preContext);
3084 err.append((UChar)124/*|*/).append(pe.postContext);
3085 logln("Ok: " + err);
3086 return;
3087 }
3088 errln("FAIL: No syntax error");
3089 }
3090
3091 /**
3092 * Test the use variable range pragma, making sure that use of
3093 * variable range characters is detected and flagged as an error.
3094 */
TestVariableRange()3095 void TransliteratorTest::TestVariableRange() {
3096 UnicodeString rule = "use variable range 0x70 0x72; a > A; b > B; q > Q;";
3097 UErrorCode ec = U_ZERO_ERROR;
3098 UParseError pe;
3099 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3100 delete t;
3101 if (U_FAILURE(ec)) {
3102 UnicodeString err(pe.preContext);
3103 err.append((UChar)124/*|*/).append(pe.postContext);
3104 logln("Ok: " + err);
3105 return;
3106 }
3107 errln("FAIL: No syntax error");
3108 }
3109
3110 /**
3111 * Test invalid post context error handling
3112 */
TestInvalidPostContext()3113 void TransliteratorTest::TestInvalidPostContext() {
3114 UnicodeString rule = "a}b{c>d;";
3115 UErrorCode ec = U_ZERO_ERROR;
3116 UParseError pe;
3117 Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3118 delete t;
3119 if (U_FAILURE(ec)) {
3120 UnicodeString err(pe.preContext);
3121 err.append((UChar)124/*|*/).append(pe.postContext);
3122 if (err.indexOf("a}b{c") >= 0) {
3123 logln("Ok: " + err);
3124 } else {
3125 errln("FAIL: " + err);
3126 }
3127 return;
3128 }
3129 errln("FAIL: No syntax error");
3130 }
3131
3132 /**
3133 * Test ID form variants
3134 */
TestIDForms()3135 void TransliteratorTest::TestIDForms() {
3136 const char* DATA[] = {
3137 "NFC", NULL, "NFD",
3138 "nfd", NULL, "NFC", // make sure case is ignored
3139 "Any-NFKD", NULL, "Any-NFKC",
3140 "Null", NULL, "Null",
3141 "-nfkc", "nfkc", "NFKD",
3142 "-nfkc/", "nfkc", "NFKD",
3143 "Latin-Greek/UNGEGN", NULL, "Greek-Latin/UNGEGN",
3144 "Greek/UNGEGN-Latin", "Greek-Latin/UNGEGN", "Latin-Greek/UNGEGN",
3145 "Bengali-Devanagari/", "Bengali-Devanagari", "Devanagari-Bengali",
3146 "Source-", NULL, NULL,
3147 "Source/Variant-", NULL, NULL,
3148 "Source-/Variant", NULL, NULL,
3149 "/Variant", NULL, NULL,
3150 "/Variant-", NULL, NULL,
3151 "-/Variant", NULL, NULL,
3152 "-/", NULL, NULL,
3153 "-", NULL, NULL,
3154 "/", NULL, NULL,
3155 };
3156 const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3157
3158 for (int32_t i=0; i<DATA_length; i+=3) {
3159 const char* ID = DATA[i];
3160 const char* expID = DATA[i+1];
3161 const char* expInvID = DATA[i+2];
3162 UBool expValid = (expInvID != NULL);
3163 if (expID == NULL) {
3164 expID = ID;
3165 }
3166 UParseError pe;
3167 UErrorCode ec = U_ZERO_ERROR;
3168 Transliterator *t =
3169 Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
3170 if (U_FAILURE(ec)) {
3171 if (!expValid) {
3172 logln((UnicodeString)"Ok: getInstance(" + ID +") => " + u_errorName(ec));
3173 } else {
3174 dataerrln((UnicodeString)"FAIL: Couldn't create " + ID + " - " + u_errorName(ec));
3175 }
3176 delete t;
3177 continue;
3178 }
3179 Transliterator *u = t->createInverse(ec);
3180 if (U_FAILURE(ec)) {
3181 errln((UnicodeString)"FAIL: Couldn't create inverse of " + ID);
3182 delete t;
3183 delete u;
3184 continue;
3185 }
3186 if (t->getID() == expID &&
3187 u->getID() == expInvID) {
3188 logln((UnicodeString)"Ok: " + ID + ".getInverse() => " + expInvID);
3189 } else {
3190 errln((UnicodeString)"FAIL: getInstance(" + ID + ") => " +
3191 t->getID() + " x getInverse() => " + u->getID() +
3192 ", expected " + expInvID);
3193 }
3194 delete t;
3195 delete u;
3196 }
3197 }
3198
// NUL-terminated UTF-16 strings used by checkRules() below to strip
// whitespace out of rule text before comparing it.
static const UChar SPACE[] = {32,0}; // U+0020 SPACE
static const UChar NEWLINE[] = {10,0}; // U+000A LINE FEED
static const UChar RETURN[] = {13,0}; // U+000D CARRIAGE RETURN
static const UChar EMPTY[] = {0}; // empty string (the replacement text)
3203
checkRules(const UnicodeString & label,Transliterator & t2,const UnicodeString & testRulesForward)3204 void TransliteratorTest::checkRules(const UnicodeString& label, Transliterator& t2,
3205 const UnicodeString& testRulesForward) {
3206 UnicodeString rules2; t2.toRules(rules2, TRUE);
3207 //rules2 = TestUtility.replaceAll(rules2, new UnicodeSet("[' '\n\r]"), "");
3208 rules2.findAndReplace(SPACE, EMPTY);
3209 rules2.findAndReplace(NEWLINE, EMPTY);
3210 rules2.findAndReplace(RETURN, EMPTY);
3211
3212 UnicodeString testRules(testRulesForward); testRules.findAndReplace(SPACE, EMPTY);
3213
3214 if (rules2 != testRules) {
3215 errln(label);
3216 logln((UnicodeString)"GENERATED RULES: " + rules2);
3217 logln((UnicodeString)"SHOULD BE: " + testRulesForward);
3218 }
3219 }
3220
3221 /**
3222 * Mark's toRules test.
3223 */
TestToRulesMark()3224 void TransliteratorTest::TestToRulesMark() {
3225 const char* testRules =
3226 "::[[:Latin:][:Mark:]];"
3227 "::NFKD (NFC);"
3228 "::Lower (Lower);"
3229 "a <> \\u03B1;" // alpha
3230 "::NFKC (NFD);"
3231 "::Upper (Lower);"
3232 "::Lower ();"
3233 "::([[:Greek:][:Mark:]]);"
3234 ;
3235 const char* testRulesForward =
3236 "::[[:Latin:][:Mark:]];"
3237 "::NFKD(NFC);"
3238 "::Lower(Lower);"
3239 "a > \\u03B1;"
3240 "::NFKC(NFD);"
3241 "::Upper (Lower);"
3242 "::Lower ();"
3243 ;
3244 const char* testRulesBackward =
3245 "::[[:Greek:][:Mark:]];"
3246 "::Lower (Upper);"
3247 "::NFD(NFKC);"
3248 "\\u03B1 > a;"
3249 "::Lower(Lower);"
3250 "::NFC(NFKD);"
3251 ;
3252 UnicodeString source = CharsToUnicodeString("\\u00E1"); // a-acute
3253 UnicodeString target = CharsToUnicodeString("\\u03AC"); // alpha-acute
3254
3255 UParseError pe;
3256 UErrorCode ec = U_ZERO_ERROR;
3257 LocalPointer<Transliterator> t2(
3258 Transliterator::createFromRules("source-target", UnicodeString(testRules, -1, US_INV), UTRANS_FORWARD, pe, ec));
3259 LocalPointer<Transliterator> t3(
3260 Transliterator::createFromRules("target-source", UnicodeString(testRules, -1, US_INV), UTRANS_REVERSE, pe, ec));
3261
3262 if (U_FAILURE(ec)) {
3263 dataerrln((UnicodeString)"FAIL: createFromRules => " + u_errorName(ec));
3264 return;
3265 }
3266
3267 expect(*t2, source, target);
3268 expect(*t3, target, source);
3269
3270 checkRules("Failed toRules FORWARD", *t2, UnicodeString(testRulesForward, -1, US_INV));
3271 checkRules("Failed toRules BACKWARD", *t3, UnicodeString(testRulesBackward, -1, US_INV));
3272 }
3273
3274 /**
3275 * Test Escape and Unescape transliterators.
3276 */
TestEscape()3277 void TransliteratorTest::TestEscape() {
3278 UParseError pe;
3279 UErrorCode ec;
3280 Transliterator *t;
3281
3282 ec = U_ZERO_ERROR;
3283 t = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, pe, ec);
3284 if (U_FAILURE(ec)) {
3285 errln((UnicodeString)"FAIL: createInstance");
3286 } else {
3287 expect(*t,
3288 UNICODE_STRING_SIMPLE("\\x{40}\\U000000312Q"),
3289 "@12Q");
3290 }
3291 delete t;
3292
3293 ec = U_ZERO_ERROR;
3294 t = Transliterator::createInstance("Any-Hex/C", UTRANS_FORWARD, pe, ec);
3295 if (U_FAILURE(ec)) {
3296 errln((UnicodeString)"FAIL: createInstance");
3297 } else {
3298 expect(*t,
3299 CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3300 UNICODE_STRING_SIMPLE("\\u0041\\U0010BEEF\\uFEED"));
3301 }
3302 delete t;
3303
3304 ec = U_ZERO_ERROR;
3305 t = Transliterator::createInstance("Any-Hex/Java", UTRANS_FORWARD, pe, ec);
3306 if (U_FAILURE(ec)) {
3307 errln((UnicodeString)"FAIL: createInstance");
3308 } else {
3309 expect(*t,
3310 CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3311 UNICODE_STRING_SIMPLE("\\u0041\\uDBEF\\uDEEF\\uFEED"));
3312 }
3313 delete t;
3314
3315 ec = U_ZERO_ERROR;
3316 t = Transliterator::createInstance("Any-Hex/Perl", UTRANS_FORWARD, pe, ec);
3317 if (U_FAILURE(ec)) {
3318 errln((UnicodeString)"FAIL: createInstance");
3319 } else {
3320 expect(*t,
3321 CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3322 UNICODE_STRING_SIMPLE("\\x{41}\\x{10BEEF}\\x{FEED}"));
3323 }
3324 delete t;
3325 }
3326
3327
TestAnchorMasking()3328 void TransliteratorTest::TestAnchorMasking(){
3329 UnicodeString rule ("^a > Q; a > q;");
3330 UErrorCode status= U_ZERO_ERROR;
3331 UParseError parseError;
3332
3333 Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
3334 if(U_FAILURE(status)){
3335 errln(UnicodeString("FAIL: ") + "ID" +
3336 ".createFromRules() => bad rules" +
3337 /*", parse error " + parseError.code +*/
3338 ", line " + parseError.line +
3339 ", offset " + parseError.offset +
3340 ", context " + prettify(parseError.preContext, TRUE) +
3341 ", rules: " + prettify(rule, TRUE));
3342 }
3343 delete t;
3344 }
3345
3346 /**
3347 * Make sure display names of variants look reasonable.
3348 */
TestDisplayName()3349 void TransliteratorTest::TestDisplayName() {
3350 #if UCONFIG_NO_FORMATTING
3351 logln("Skipping, UCONFIG_NO_FORMATTING is set\n");
3352 return;
3353 #else
3354 static const char* DATA[] = {
3355 // ID, forward name, reverse name
3356 // Update the text as necessary -- the important thing is
3357 // not the text itself, but how various cases are handled.
3358
3359 // Basic test
3360 "Any-Hex", "Any to Hex Escape", "Hex Escape to Any",
3361
3362 // Variants
3363 "Any-Hex/Perl", "Any to Hex Escape/Perl", "Hex Escape to Any/Perl",
3364
3365 // Target-only IDs
3366 "NFC", "Any to NFC", "Any to NFD",
3367 };
3368
3369 int32_t DATA_length = UPRV_LENGTHOF(DATA);
3370
3371 Locale US("en", "US");
3372
3373 for (int32_t i=0; i<DATA_length; i+=3) {
3374 UnicodeString name;
3375 Transliterator::getDisplayName(DATA[i], US, name);
3376 if (name != DATA[i+1]) {
3377 dataerrln((UnicodeString)"FAIL: " + DATA[i] + ".getDisplayName() => " +
3378 name + ", expected " + DATA[i+1]);
3379 } else {
3380 logln((UnicodeString)"Ok: " + DATA[i] + ".getDisplayName() => " + name);
3381 }
3382 UErrorCode ec = U_ZERO_ERROR;
3383 UParseError pe;
3384 Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_REVERSE, pe, ec);
3385 if (U_FAILURE(ec)) {
3386 delete t;
3387 dataerrln("FAIL: createInstance failed - %s", u_errorName(ec));
3388 continue;
3389 }
3390 name = Transliterator::getDisplayName(t->getID(), US, name);
3391 if (name != DATA[i+2]) {
3392 dataerrln((UnicodeString)"FAIL: " + t->getID() + ".getDisplayName() => " +
3393 name + ", expected " + DATA[i+2]);
3394 } else {
3395 logln((UnicodeString)"Ok: " + t->getID() + ".getDisplayName() => " + name);
3396 }
3397 delete t;
3398 }
3399 #endif
3400 }
3401
TestSpecialCases(void)3402 void TransliteratorTest::TestSpecialCases(void) {
3403 const UnicodeString registerRules[] = {
3404 "Any-Dev1", "x > X; y > Y;",
3405 "Any-Dev2", "XY > Z",
3406 "Greek-Latin/FAKE",
3407 CharsToUnicodeString
3408 ("[^[:L:][:M:]] { \\u03bc\\u03c0 > b ; \\u03bc\\u03c0 } [^[:L:][:M:]] > b ; [^[:L:][:M:]] { [\\u039c\\u03bc][\\u03a0\\u03c0] > B ; [\\u039c\\u03bc][\\u03a0\\u03c0] } [^[:L:][:M:]] > B ;"),
3409 "" // END MARKER
3410 };
3411
3412 const UnicodeString testCases[] = {
3413 // NORMALIZATION
3414 // should add more test cases
3415 "NFD" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3416 "NFC" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3417 "NFKD", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3418 "NFKC", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3419
3420 // mp -> b BUG
3421 "Greek-Latin/UNGEGN", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3422 "Greek-Latin/FAKE", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3423
3424 // check for devanagari bug
3425 "nfd;Dev1;Dev2;nfc", "xy", "Z",
3426
3427 // ff, i, dotless-i, I, dotted-I, LJLjlj deseret deeDEE
3428 "Title", CharsToUnicodeString("ab'cD ffi\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3429 CharsToUnicodeString("Ab'cd Ffi\\u0131ii\\u0307 \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3430
3431 //TODO: enable this test once Titlecase works right
3432 /*
3433 "Title", CharsToUnicodeString("\\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3434 CharsToUnicodeString("Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3435 */
3436 "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3437 CharsToUnicodeString("AB'CD FFIII\\u0130 \\u01C7\\u01C7\\u01C7 ") + DESERET_DEE + DESERET_DEE,
3438 "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3439 CharsToUnicodeString("ab'cd \\uFB00i\\u0131ii\\u0307 \\u01C9\\u01C9\\u01C9 ") + DESERET_dee + DESERET_dee,
3440
3441 "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3442 "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3443
3444 // FORMS OF S
3445 "Greek-Latin/UNGEGN", CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3446 CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3447 "Latin-Greek/UNGEGN", CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3448 CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3") ,
3449 "Greek-Latin", CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3450 CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3451 "Latin-Greek", CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3452 CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3453 // Tatiana bug
3454 // Upper: TAT\\u02B9\\u00C2NA
3455 // Lower: tat\\u02B9\\u00E2na
3456 // Title: Tat\\u02B9\\u00E2na
3457 "Upper", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3458 CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3459 "Lower", CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3460 CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3461 "Title", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3462 CharsToUnicodeString("Tat\\u02B9\\u00E2na"),
3463
3464 "" // END MARKER
3465 };
3466
3467 UParseError pos;
3468 int32_t i;
3469 for (i = 0; registerRules[i].length()!=0; i+=2) {
3470 UErrorCode status = U_ZERO_ERROR;
3471
3472 Transliterator *t = Transliterator::createFromRules(registerRules[0+i],
3473 registerRules[i+1], UTRANS_FORWARD, pos, status);
3474 if (U_FAILURE(status)) {
3475 dataerrln("Fails: Unable to create the transliterator from rules. - %s", u_errorName(status));
3476 } else {
3477 Transliterator::registerInstance(t);
3478 }
3479 }
3480 for (i = 0; testCases[i].length()!=0; i+=3) {
3481 UErrorCode ec = U_ZERO_ERROR;
3482 UParseError pe;
3483 const UnicodeString& name = testCases[i];
3484 Transliterator *t = Transliterator::createInstance(name, UTRANS_FORWARD, pe, ec);
3485 if (U_FAILURE(ec)) {
3486 dataerrln((UnicodeString)"FAIL: Couldn't create " + name + " - " + u_errorName(ec));
3487 delete t;
3488 continue;
3489 }
3490 const UnicodeString& id = t->getID();
3491 const UnicodeString& source = testCases[i+1];
3492 UnicodeString target;
3493
3494 // Automatic generation of targets, to make it simpler to add test cases (and more fail-safe)
3495
3496 if (testCases[i+2].length() > 0) {
3497 target = testCases[i+2];
3498 } else if (0==id.caseCompare("NFD", U_FOLD_CASE_DEFAULT)) {
3499 Normalizer::normalize(source, UNORM_NFD, 0, target, ec);
3500 } else if (0==id.caseCompare("NFC", U_FOLD_CASE_DEFAULT)) {
3501 Normalizer::normalize(source, UNORM_NFC, 0, target, ec);
3502 } else if (0==id.caseCompare("NFKD", U_FOLD_CASE_DEFAULT)) {
3503 Normalizer::normalize(source, UNORM_NFKD, 0, target, ec);
3504 } else if (0==id.caseCompare("NFKC", U_FOLD_CASE_DEFAULT)) {
3505 Normalizer::normalize(source, UNORM_NFKC, 0, target, ec);
3506 } else if (0==id.caseCompare("Lower", U_FOLD_CASE_DEFAULT)) {
3507 target = source;
3508 target.toLower(Locale::getUS());
3509 } else if (0==id.caseCompare("Upper", U_FOLD_CASE_DEFAULT)) {
3510 target = source;
3511 target.toUpper(Locale::getUS());
3512 }
3513 if (U_FAILURE(ec)) {
3514 errln((UnicodeString)"FAIL: Internal error normalizing " + source);
3515 continue;
3516 }
3517
3518 expect(*t, source, target);
3519 delete t;
3520 }
3521 for (i = 0; registerRules[i].length()!=0; i+=2) {
3522 Transliterator::unregister(registerRules[i]);
3523 }
3524 }
3525
/**
 * Format a code point as an escape sequence: "\uXXXX" (four lowercase hex
 * digits) for values up to 0xFFFF, "\UXXXXXXXX" (eight digits) otherwise.
 *
 * @param ch     code point to format
 * @param buffer destination; the caller must supply enough room for the
 *               escape text plus the terminating NUL
 * @return buffer
 */
char* Char32ToEscapedChars(UChar32 ch, char* buffer) {
    if (ch > 0xFFFF) {
        // Above the 16-bit range: long form.
        sprintf(buffer, "\\U%08x", static_cast<int>(ch));
        return buffer;
    }
    sprintf(buffer, "\\u%04x", static_cast<int>(ch));
    return buffer;
}
3534
TestSurrogateCasing(void)3535 void TransliteratorTest::TestSurrogateCasing (void) {
3536 // check that casing handles surrogates
3537 // titlecase is currently defective
3538 char buffer[20];
3539 UChar buffer2[20];
3540 UChar32 dee;
3541 U16_GET(DESERET_dee,0, 0, DESERET_dee.length(), dee);
3542 UnicodeString DEE(u_totitle(dee));
3543 if (DEE != DESERET_DEE) {
3544 err("Fails titlecase of surrogates");
3545 err(Char32ToEscapedChars(dee, buffer));
3546 err(", ");
3547 errln(Char32ToEscapedChars(DEE.char32At(0), buffer));
3548 }
3549
3550 UnicodeString deeDEETest=DESERET_dee + DESERET_DEE;
3551 UnicodeString deedeeTest = DESERET_dee + DESERET_dee;
3552 UnicodeString DEEDEETest = DESERET_DEE + DESERET_DEE;
3553 UErrorCode status= U_ZERO_ERROR;
3554
3555 u_strToUpper(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
3556 if (U_FAILURE(status) || (UnicodeString(buffer2)!= DEEDEETest)) {
3557 errln("Fails: Can't uppercase surrogates.");
3558 }
3559
3560 status= U_ZERO_ERROR;
3561 u_strToLower(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
3562 if (U_FAILURE(status) || (UnicodeString(buffer2)!= deedeeTest)) {
3563 errln("Fails: Can't lowercase surrogates.");
3564 }
3565 }
3566
_trans(Transliterator & t,const UnicodeString & src,UnicodeString & result)3567 static void _trans(Transliterator& t, const UnicodeString& src,
3568 UnicodeString& result) {
3569 result = src;
3570 t.transliterate(result);
3571 }
3572
_trans(const UnicodeString & id,const UnicodeString & src,UnicodeString & result,UErrorCode ec)3573 static void _trans(const UnicodeString& id, const UnicodeString& src,
3574 UnicodeString& result, UErrorCode ec) {
3575 UParseError pe;
3576 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
3577 if (U_SUCCESS(ec)) {
3578 _trans(*t, src, result);
3579 }
3580 delete t;
3581 }
3582
/**
 * Look up source (ignoring case) in a flat array of (key, value) pairs.
 *
 * @param source key to search for
 * @param pairs  key/value entries; the scan stops at the first empty key
 * @return the value paired with the first matching key, or an empty string
 *         when no key matches
 */
static UnicodeString _findMatch(const UnicodeString& source,
                                const UnicodeString* pairs) {
    for (const UnicodeString* entry = pairs; entry->length() > 0; entry += 2) {
        if (source.caseCompare(entry[0], U_FOLD_CASE_DEFAULT) == 0) {
            return entry[1];
        }
    }
    return UnicodeString();
}
3593
3594 // Check to see that incremental gets at least part way through a reasonable string.
3595
TestIncrementalProgress(void)3596 void TransliteratorTest::TestIncrementalProgress(void) {
3597 UErrorCode ec = U_ZERO_ERROR;
3598 UnicodeString latinTest = "The Quick Brown Fox.";
3599 UnicodeString devaTest;
3600 _trans("Latin-Devanagari", latinTest, devaTest, ec);
3601 UnicodeString kataTest;
3602 _trans("Latin-Katakana", latinTest, kataTest, ec);
3603 if (U_FAILURE(ec)) {
3604 errln("FAIL: Internal error");
3605 return;
3606 }
3607 const UnicodeString tests[] = {
3608 "Any", latinTest,
3609 "Latin", latinTest,
3610 "Halfwidth", latinTest,
3611 "Devanagari", devaTest,
3612 "Katakana", kataTest,
3613 "" // END MARKER
3614 };
3615
3616 UnicodeString test("The Quick Brown Fox Jumped Over The Lazy Dog.");
3617 int32_t i = 0, j=0, k=0;
3618 int32_t sources = Transliterator::countAvailableSources();
3619 for (i = 0; i < sources; i++) {
3620 UnicodeString source;
3621 Transliterator::getAvailableSource(i, source);
3622 UnicodeString test = _findMatch(source, tests);
3623 if (test.length() == 0) {
3624 logln((UnicodeString)"Skipping " + source + "-X");
3625 continue;
3626 }
3627 int32_t targets = Transliterator::countAvailableTargets(source);
3628 for (j = 0; j < targets; j++) {
3629 UnicodeString target;
3630 Transliterator::getAvailableTarget(j, source, target);
3631 int32_t variants = Transliterator::countAvailableVariants(source, target);
3632 for (k =0; k< variants; k++) {
3633 UnicodeString variant;
3634 UParseError err;
3635 UErrorCode status = U_ZERO_ERROR;
3636
3637 Transliterator::getAvailableVariant(k, source, target, variant);
3638 UnicodeString id = source + "-" + target + "/" + variant;
3639
3640 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, err, status);
3641 if (U_FAILURE(status)) {
3642 dataerrln((UnicodeString)"FAIL: Could not create " + id);
3643 delete t;
3644 continue;
3645 }
3646 status = U_ZERO_ERROR;
3647 CheckIncrementalAux(t, test);
3648
3649 UnicodeString rev;
3650 _trans(*t, test, rev);
3651 Transliterator *inv = t->createInverse(status);
3652 if (U_FAILURE(status)) {
3653 // The following are forward-only, it is OK that creating an inverse will not work:
3654 // 1. Devanagari-Arabic
3655 // 2. Any-*/BGN
3656 // 2a. Any-*/BGN_1981
3657 // 3. Any-*/UNGEGN
3658 // 4. Any-*/MNS
3659 // If UCONFIG_NO_BREAK_ITERATION is on, Latin-Thai is also not expected to work.
3660 if ( id.compare((UnicodeString)"Devanagari-Arabic/") != 0
3661 && !(id.startsWith((UnicodeString)"Any-") &&
3662 (id.endsWith((UnicodeString)"/BGN") || id.endsWith((UnicodeString)"/BGN_1981") || id.endsWith((UnicodeString)"/UNGEGN") || id.endsWith((UnicodeString)"/MNS"))
3663 )
3664 #if UCONFIG_NO_BREAK_ITERATION
3665 && id.compare((UnicodeString)"Latin-Thai/") != 0
3666 #endif
3667 )
3668 {
3669 errln((UnicodeString)"FAIL: Could not create inverse of " + id);
3670 }
3671 delete t;
3672 delete inv;
3673 continue;
3674 }
3675 CheckIncrementalAux(inv, rev);
3676 delete t;
3677 delete inv;
3678 }
3679 }
3680 }
3681 }
3682
CheckIncrementalAux(const Transliterator * t,const UnicodeString & input)3683 void TransliteratorTest::CheckIncrementalAux(const Transliterator* t,
3684 const UnicodeString& input) {
3685 UErrorCode ec = U_ZERO_ERROR;
3686 UTransPosition pos;
3687 UnicodeString test = input;
3688
3689 pos.contextStart = 0;
3690 pos.contextLimit = input.length();
3691 pos.start = 0;
3692 pos.limit = input.length();
3693
3694 t->transliterate(test, pos, ec);
3695 if (U_FAILURE(ec)) {
3696 errln((UnicodeString)"FAIL: transliterate() error " + u_errorName(ec));
3697 return;
3698 }
3699 UBool gotError = FALSE;
3700 (void)gotError; // Suppress set but not used warning.
3701
3702 // we have a few special cases. Any-Remove (pos.start = 0, but also = limit) and U+XXXXX?X?
3703
3704 if (pos.start == 0 && pos.limit != 0 && t->getID() != "Hex-Any/Unicode") {
3705 errln((UnicodeString)"No Progress, " +
3706 t->getID() + ": " + formatInput(test, input, pos));
3707 gotError = TRUE;
3708 } else {
3709 logln((UnicodeString)"PASS Progress, " +
3710 t->getID() + ": " + formatInput(test, input, pos));
3711 }
3712 t->finishTransliteration(test, pos);
3713 if (pos.start != pos.limit) {
3714 errln((UnicodeString)"Incomplete, " +
3715 t->getID() + ": " + formatInput(test, input, pos));
3716 gotError = TRUE;
3717 }
3718 }
3719
TestFunction()3720 void TransliteratorTest::TestFunction() {
3721 // Careful with spacing and ';' here: Phrase this exactly
3722 // as toRules() is going to return it. If toRules() changes
3723 // with regard to spacing or ';', then adjust this string.
3724 UnicodeString rule =
3725 "([:Lu:]) > $1 '(' &Lower( $1 ) '=' &Hex( &Any-Lower( $1 ) ) ')';";
3726
3727 UParseError pe;
3728 UErrorCode ec = U_ZERO_ERROR;
3729 Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3730 if (t == NULL) {
3731 dataerrln("FAIL: createFromRules failed - %s", u_errorName(ec));
3732 return;
3733 }
3734
3735 UnicodeString r;
3736 t->toRules(r, TRUE);
3737 if (r == rule) {
3738 logln((UnicodeString)"OK: toRules() => " + r);
3739 } else {
3740 errln((UnicodeString)"FAIL: toRules() => " + r +
3741 ", expected " + rule);
3742 }
3743
3744 expect(*t, "The Quick Brown Fox",
3745 UNICODE_STRING_SIMPLE("T(t=\\u0074)he Q(q=\\u0071)uick B(b=\\u0062)rown F(f=\\u0066)ox"));
3746
3747 delete t;
3748 }
3749
TestInvalidBackRef(void)3750 void TransliteratorTest::TestInvalidBackRef(void) {
3751 UnicodeString rule = ". > $1;";
3752 UnicodeString rule2 =CharsToUnicodeString("(.) <> &hex/unicode($1) &name($1); . > $1; [{}] >\\u0020;");
3753 UParseError pe;
3754 UErrorCode ec = U_ZERO_ERROR;
3755 Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3756 Transliterator *t2 = Transliterator::createFromRules("Test2", rule2, UTRANS_FORWARD, pe, ec);
3757
3758 if (t != NULL) {
3759 errln("FAIL: createFromRules should have returned NULL");
3760 delete t;
3761 }
3762
3763 if (t2 != NULL) {
3764 errln("FAIL: createFromRules should have returned NULL");
3765 delete t2;
3766 }
3767
3768 if (U_SUCCESS(ec)) {
3769 errln("FAIL: Ok: . > $1; => no error");
3770 } else {
3771 logln((UnicodeString)"Ok: . > $1; => " + u_errorName(ec));
3772 }
3773 }
3774
TestMulticharStringSet()3775 void TransliteratorTest::TestMulticharStringSet() {
3776 // Basic testing
3777 const char* rule =
3778 " [{aa}] > x;"
3779 " a > y;"
3780 " [b{bc}] > z;"
3781 "[{gd}] { e > q;"
3782 " e } [{fg}] > r;" ;
3783
3784 UParseError pe;
3785 UErrorCode ec = U_ZERO_ERROR;
3786 Transliterator* t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3787 if (t == NULL || U_FAILURE(ec)) {
3788 delete t;
3789 errln("FAIL: createFromRules failed");
3790 return;
3791 }
3792
3793 expect(*t, "a aa ab bc d gd de gde gdefg ddefg",
3794 "y x yz z d gd de gdq gdqfg ddrfg");
3795 delete t;
3796
3797 // Overlapped string test. Make sure that when multiple
3798 // strings can match that the longest one is matched.
3799 rule =
3800 " [a {ab} {abc}] > x;"
3801 " b > y;"
3802 " c > z;"
3803 " q [t {st} {rst}] { e > p;" ;
3804
3805 t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3806 if (t == NULL || U_FAILURE(ec)) {
3807 delete t;
3808 errln("FAIL: createFromRules failed");
3809 return;
3810 }
3811
3812 expect(*t, "a ab abc qte qste qrste",
3813 "x x x qtp qstp qrstp");
3814 delete t;
3815 }
3816
3817 // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
3818 // BEGIN TestUserFunction support factory
3819
3820 Transliterator* _TUFF[4];
3821 UnicodeString* _TUFID[4];
3822
_TUFFactory(const UnicodeString &,Transliterator::Token context)3823 static Transliterator* U_EXPORT2 _TUFFactory(const UnicodeString& /*ID*/,
3824 Transliterator::Token context) {
3825 return _TUFF[context.integer]->clone();
3826 }
3827
_TUFReg(const UnicodeString & ID,Transliterator * t,int32_t n)3828 static void _TUFReg(const UnicodeString& ID, Transliterator* t, int32_t n) {
3829 _TUFF[n] = t;
3830 _TUFID[n] = new UnicodeString(ID);
3831 Transliterator::registerFactory(ID, _TUFFactory, Transliterator::integerToken(n));
3832 }
3833
_TUFUnreg(int32_t n)3834 static void _TUFUnreg(int32_t n) {
3835 if (_TUFF[n] != NULL) {
3836 Transliterator::unregister(*_TUFID[n]);
3837 delete _TUFF[n];
3838 delete _TUFID[n];
3839 }
3840 }
3841
3842 // END TestUserFunction support factory
3843 // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3844
3845 /**
3846 * Test that user-registered transliterators can be used under function
3847 * syntax.
3848 */
TestUserFunction()3849 void TransliteratorTest::TestUserFunction() {
3850
3851 Transliterator* t;
3852 UParseError pe;
3853 UErrorCode ec = U_ZERO_ERROR;
3854
3855 // Setup our factory
3856 int32_t i;
3857 for (i=0; i<4; ++i) {
3858 _TUFF[i] = NULL;
3859 }
3860
3861 // There's no need to register inverses if we don't use them
3862 t = Transliterator::createFromRules("gif",
3863 UNICODE_STRING_SIMPLE("'\\'u(..)(..) > '<img src=\"http://www.unicode.org/gifs/24/' $1 '/U' $1$2 '.gif\">';"),
3864 UTRANS_FORWARD, pe, ec);
3865 if (t == NULL || U_FAILURE(ec)) {
3866 dataerrln((UnicodeString)"FAIL: createFromRules gif " + u_errorName(ec));
3867 return;
3868 }
3869 _TUFReg("Any-gif", t, 0);
3870
3871 t = Transliterator::createFromRules("RemoveCurly",
3872 UNICODE_STRING_SIMPLE("[\\{\\}] > ; '\\N' > ;"),
3873 UTRANS_FORWARD, pe, ec);
3874 if (t == NULL || U_FAILURE(ec)) {
3875 errln((UnicodeString)"FAIL: createFromRules RemoveCurly " + u_errorName(ec));
3876 goto FAIL;
3877 }
3878 expect(*t, UNICODE_STRING_SIMPLE("\\N{name}"), "name");
3879 _TUFReg("Any-RemoveCurly", t, 1);
3880
3881 logln("Trying &hex");
3882 t = Transliterator::createFromRules("hex2",
3883 "(.) > &hex($1);",
3884 UTRANS_FORWARD, pe, ec);
3885 if (t == NULL || U_FAILURE(ec)) {
3886 errln("FAIL: createFromRules");
3887 goto FAIL;
3888 }
3889 logln("Registering");
3890 _TUFReg("Any-hex2", t, 2);
3891 t = Transliterator::createInstance("Any-hex2", UTRANS_FORWARD, ec);
3892 if (t == NULL || U_FAILURE(ec)) {
3893 errln((UnicodeString)"FAIL: createInstance Any-hex2 " + u_errorName(ec));
3894 goto FAIL;
3895 }
3896 expect(*t, "abc", UNICODE_STRING_SIMPLE("\\u0061\\u0062\\u0063"));
3897 delete t;
3898
3899 logln("Trying &gif");
3900 t = Transliterator::createFromRules("gif2",
3901 "(.) > &Gif(&Hex2($1));",
3902 UTRANS_FORWARD, pe, ec);
3903 if (t == NULL || U_FAILURE(ec)) {
3904 errln((UnicodeString)"FAIL: createFromRules gif2 " + u_errorName(ec));
3905 goto FAIL;
3906 }
3907 logln("Registering");
3908 _TUFReg("Any-gif2", t, 3);
3909 t = Transliterator::createInstance("Any-gif2", UTRANS_FORWARD, ec);
3910 if (t == NULL || U_FAILURE(ec)) {
3911 errln((UnicodeString)"FAIL: createInstance Any-gif2 " + u_errorName(ec));
3912 goto FAIL;
3913 }
3914 expect(*t, "ab", "<img src=\"http://www.unicode.org/gifs/24/00/U0061.gif\">"
3915 "<img src=\"http://www.unicode.org/gifs/24/00/U0062.gif\">");
3916 delete t;
3917
3918 // Test that filters are allowed after &
3919 t = Transliterator::createFromRules("test",
3920 "(.) > &Hex($1) ' ' &RemoveCurly(&Name($1)) ' ';",
3921 UTRANS_FORWARD, pe, ec);
3922 if (t == NULL || U_FAILURE(ec)) {
3923 errln((UnicodeString)"FAIL: createFromRules test " + u_errorName(ec));
3924 goto FAIL;
3925 }
3926 expect(*t, "abc",
3927 UNICODE_STRING_SIMPLE("\\u0061 LATIN SMALL LETTER A \\u0062 LATIN SMALL LETTER B \\u0063 LATIN SMALL LETTER C "));
3928 delete t;
3929
3930 FAIL:
3931 for (i=0; i<4; ++i) {
3932 _TUFUnreg(i);
3933 }
3934 }
3935
3936 /**
3937 * Test the Any-X transliterators.
3938 */
TestAnyX(void)3939 void TransliteratorTest::TestAnyX(void) {
3940 UParseError parseError;
3941 UErrorCode status = U_ZERO_ERROR;
3942 Transliterator* anyLatin =
3943 Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
3944 if (anyLatin==0) {
3945 dataerrln("FAIL: createInstance returned NULL - %s", u_errorName(status));
3946 delete anyLatin;
3947 return;
3948 }
3949
3950 expect(*anyLatin,
3951 CharsToUnicodeString("greek:\\u03B1\\u03B2\\u03BA\\u0391\\u0392\\u039A hiragana:\\u3042\\u3076\\u304F cyrillic:\\u0430\\u0431\\u0446"),
3952 CharsToUnicodeString("greek:abkABK hiragana:abuku cyrillic:abc"));
3953
3954 delete anyLatin;
3955 }
3956
3957 /**
3958 * Test Any-X transliterators with sample letters from all scripts.
3959 */
TestAny(void)3960 void TransliteratorTest::TestAny(void) {
3961 UErrorCode status = U_ZERO_ERROR;
3962 // Note: there is a lot of implict construction of UnicodeStrings from (char *) in
3963 // function call parameters going on in this test.
3964 UnicodeSet alphabetic("[:alphabetic:]", status);
3965 if (U_FAILURE(status)) {
3966 dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
3967 return;
3968 }
3969 alphabetic.freeze();
3970
3971 UnicodeString testString;
3972 for (int32_t i = 0; i < USCRIPT_CODE_LIMIT; i++) {
3973 const char *scriptName = uscript_getShortName((UScriptCode)i);
3974 if (scriptName == NULL) {
3975 errln("Failure: file %s, line %d: Script Code %d is invalid, ", __FILE__, __LINE__, i);
3976 return;
3977 }
3978
3979 UnicodeSet sample;
3980 sample.applyPropertyAlias("script", scriptName, status);
3981 if (U_FAILURE(status)) {
3982 errln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
3983 return;
3984 }
3985 sample.retainAll(alphabetic);
3986 for (int32_t count=0; count<5; count++) {
3987 UChar32 c = sample.charAt(count);
3988 if (c == -1) {
3989 break;
3990 }
3991 testString.append(c);
3992 }
3993 }
3994
3995 UParseError parseError;
3996 Transliterator* anyLatin =
3997 Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
3998 if (U_FAILURE(status)) {
3999 dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4000 return;
4001 }
4002
4003 logln(UnicodeString("Sample set for Any-Latin: ") + testString);
4004 anyLatin->transliterate(testString);
4005 logln(UnicodeString("Sample result for Any-Latin: ") + testString);
4006 delete anyLatin;
4007 }
4008
4009
4010 /**
4011 * Test the source and target set API. These are only implemented
4012 * for RBT and CompoundTransliterator at this time.
4013 */
TestSourceTargetSet()4014 void TransliteratorTest::TestSourceTargetSet() {
4015 UErrorCode ec = U_ZERO_ERROR;
4016
4017 // Rules
4018 const char* r =
4019 "a > b; "
4020 "r [x{lu}] > q;";
4021
4022 // Expected source
4023 UnicodeSet expSrc("[arx{lu}]", ec);
4024
4025 // Expected target
4026 UnicodeSet expTrg("[bq]", ec);
4027
4028 UParseError pe;
4029 Transliterator* t = Transliterator::createFromRules("test", r, UTRANS_FORWARD, pe, ec);
4030
4031 if (U_FAILURE(ec)) {
4032 delete t;
4033 errln("FAIL: Couldn't set up test");
4034 return;
4035 }
4036
4037 UnicodeSet src; t->getSourceSet(src);
4038 UnicodeSet trg; t->getTargetSet(trg);
4039
4040 if (src == expSrc && trg == expTrg) {
4041 UnicodeString a, b;
4042 logln((UnicodeString)"Ok: " +
4043 r + " => source = " + src.toPattern(a, TRUE) +
4044 ", target = " + trg.toPattern(b, TRUE));
4045 } else {
4046 UnicodeString a, b, c, d;
4047 errln((UnicodeString)"FAIL: " +
4048 r + " => source = " + src.toPattern(a, TRUE) +
4049 ", expected " + expSrc.toPattern(b, TRUE) +
4050 "; target = " + trg.toPattern(c, TRUE) +
4051 ", expected " + expTrg.toPattern(d, TRUE));
4052 }
4053
4054 delete t;
4055 }
4056
4057 /**
4058 * Test handling of Pattern_White_Space, for both RBT and UnicodeSet.
4059 */
TestPatternWhiteSpace()4060 void TransliteratorTest::TestPatternWhiteSpace() {
4061 // Rules
4062 const char* r = "a > \\u200E b;";
4063
4064 UErrorCode ec = U_ZERO_ERROR;
4065 UParseError pe;
4066 Transliterator* t = Transliterator::createFromRules("test", CharsToUnicodeString(r), UTRANS_FORWARD, pe, ec);
4067
4068 if (U_FAILURE(ec)) {
4069 errln("FAIL: Couldn't set up test");
4070 } else {
4071 expect(*t, "a", "b");
4072 }
4073 delete t;
4074
4075 // UnicodeSet
4076 ec = U_ZERO_ERROR;
4077 UnicodeSet set(CharsToUnicodeString("[a \\u200E]"), ec);
4078
4079 if (U_FAILURE(ec)) {
4080 errln("FAIL: Couldn't set up test");
4081 } else {
4082 if (set.contains(0x200E)) {
4083 errln("FAIL: U+200E not being ignored by UnicodeSet");
4084 }
4085 }
4086 }
4087 //======================================================================
4088 // this method is in TestUScript.java
4089 //======================================================================
TestAllCodepoints()4090 void TransliteratorTest::TestAllCodepoints(){
4091 UScriptCode code= USCRIPT_INVALID_CODE;
4092 char id[256]={'\0'};
4093 char abbr[256]={'\0'};
4094 char newId[256]={'\0'};
4095 char newAbbrId[256]={'\0'};
4096 char oldId[256]={'\0'};
4097 char oldAbbrId[256]={'\0'};
4098
4099 UErrorCode status =U_ZERO_ERROR;
4100 UParseError pe;
4101
4102 for(uint32_t i = 0; i<=0x10ffff; i++){
4103 code = uscript_getScript(i,&status);
4104 if(code == USCRIPT_INVALID_CODE){
4105 dataerrln("uscript_getScript for codepoint \\U%08X failed.", i);
4106 }
4107 const char* myId = uscript_getName(code);
4108 if(!myId) {
4109 dataerrln("Valid script code returned NULL name. Check your data!");
4110 return;
4111 }
4112 uprv_strcpy(id,myId);
4113 uprv_strcpy(abbr,uscript_getShortName(code));
4114
4115 uprv_strcpy(newId,"[:");
4116 uprv_strcat(newId,id);
4117 uprv_strcat(newId,":];NFD");
4118
4119 uprv_strcpy(newAbbrId,"[:");
4120 uprv_strcat(newAbbrId,abbr);
4121 uprv_strcat(newAbbrId,":];NFD");
4122
4123 if(uprv_strcmp(newId,oldId)!=0){
4124 Transliterator* t = Transliterator::createInstance(newId,UTRANS_FORWARD,pe,status);
4125 if(t==NULL || U_FAILURE(status)){
4126 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4127 }
4128 delete t;
4129 }
4130 if(uprv_strcmp(newAbbrId,oldAbbrId)!=0){
4131 Transliterator* t = Transliterator::createInstance(newAbbrId,UTRANS_FORWARD,pe,status);
4132 if(t==NULL || U_FAILURE(status)){
4133 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4134 }
4135 delete t;
4136 }
4137 uprv_strcpy(oldId,newId);
4138 uprv_strcpy(oldAbbrId, newAbbrId);
4139
4140 }
4141
4142 }
4143
/* Create a transliterator by ID and check that its dynamic class ID equals */
/* the static class ID of class `cls`. `id` must be a C string.            */
#define TEST_TRANSLIT_ID(id, cls) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode idStatus = U_ZERO_ERROR; \
    Transliterator* byId = Transliterator::createInstance(id, UTRANS_FORWARD, idStatus); \
    if (U_SUCCESS(idStatus)) { \
        if (byId->getDynamicClassID() != cls::getStaticClassID()) { \
            errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
        } \
        /* An assignment-operator coverage test (*byId = *byId) can't be done here. */ \
    } else { \
        dataerrln("FAIL: Couldn't create %s - %s", id, u_errorName(idStatus)); \
    } \
    delete byId; \
} UPRV_BLOCK_MACRO_END
4157
/* Create a transliterator from rules and check that its dynamic class ID  */
/* equals the static class ID of class `cls`. `rule` must be a string      */
/* literal, because it is concatenated into the failure message.           */
#define TEST_TRANSLIT_RULE(rule, cls) UPRV_BLOCK_MACRO_BEGIN { \
    UParseError ruleParseError; \
    UErrorCode ruleStatus = U_ZERO_ERROR; \
    Transliterator* byRule = Transliterator::createFromRules("_", rule, UTRANS_FORWARD, ruleParseError, ruleStatus); \
    if (U_SUCCESS(ruleStatus)) { \
        if (byRule->getDynamicClassID() != cls::getStaticClassID()) { \
            errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
        } \
        /* An assignment-operator coverage test (*byRule = *byRule) can't be done here. */ \
    } else { \
        errln("FAIL: Couldn't create " rule); \
    } \
    delete byRule; \
} UPRV_BLOCK_MACRO_END
4172
/**
 * Class-ID boilerplate check: for each transliterator created below, the
 * TEST_TRANSLIT_ID / TEST_TRANSLIT_RULE macro compares the instance's
 * getDynamicClassID() with getStaticClassID() of the class named in the
 * second argument.
 */
void TransliteratorTest::TestBoilerplate() {
    // Instances created by ID.
    TEST_TRANSLIT_ID("Any-Latin", AnyTransliterator);
    TEST_TRANSLIT_ID("Any-Hex", EscapeTransliterator);
    TEST_TRANSLIT_ID("Hex-Any", UnescapeTransliterator);
    TEST_TRANSLIT_ID("Lower", LowercaseTransliterator);
    TEST_TRANSLIT_ID("Upper", UppercaseTransliterator);
    TEST_TRANSLIT_ID("Title", TitlecaseTransliterator);
    TEST_TRANSLIT_ID("Null", NullTransliterator);
    TEST_TRANSLIT_ID("Remove", RemoveTransliterator);
    TEST_TRANSLIT_ID("Any-Name", UnicodeNameTransliterator);
    TEST_TRANSLIT_ID("Name-Any", NameUnicodeTransliterator);
    TEST_TRANSLIT_ID("NFD", NormalizationTransliterator);
    TEST_TRANSLIT_ID("Latin-Greek", CompoundTransliterator);
    // Instance created from a rule string.
    TEST_TRANSLIT_RULE("a>b;", RuleBasedTransliterator);
}
4188
TestAlternateSyntax()4189 void TransliteratorTest::TestAlternateSyntax() {
4190 // U+2206 == &
4191 // U+2190 == <
4192 // U+2192 == >
4193 // U+2194 == <>
4194 expect(CharsToUnicodeString("a \\u2192 x; b \\u2190 y; c \\u2194 z"),
4195 "abc",
4196 "xbz");
4197 expect(CharsToUnicodeString("([:^ASCII:]) \\u2192 \\u2206Name($1);"),
4198 CharsToUnicodeString("<=\\u2190; >=\\u2192; <>=\\u2194; &=\\u2206"),
4199 UNICODE_STRING_SIMPLE("<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}"));
4200 }
4201
// Rule sets referenced by index from BEGIN_END_TEST_CASES, TestBeginEnd() and
// TestBeginEndToRules(). The entries that used ::BEGIN/::END are commented
// out and replaced by an empty string so that the remaining entries keep
// their index. Do not insert or remove entries without updating every
// BEGIN_END_RULES[n] reference.
static const char* BEGIN_END_RULES[] = {
    // [0]
    "abc > xy;"
    "aba > z;",

    // [1]
/*
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "::BEGIN;"
    "aba > z;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [2]
/*
    "abc > xy;"
    "::BEGIN;"
    "aba > z;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [3]
/*
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "aba > z;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [4]
    "abc > xy;"
    "::Null;"
    "aba > z;",

    // [5]
    "::Upper;"
    "ABC > xy;"
    "AB > x;"
    "C > z;"
    "::Upper;"
    "XYZ > p;"
    "XY > q;"
    "Z > r;"
    "::Upper;",

    // [6]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [7]
    "::Null;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [8]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;",

    // [9]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [10]
/*
    "::BEGIN;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::END;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [11]
/*
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::BEGIN;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [12]
/*
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::BEGIN;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::END;"
    "::BEGIN;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::END;"
    "::BEGIN;"
    "'a-a' > a\\%|a;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [13]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::Null;"
    "'a-a' > a\\%|a;",

    // [14]
/*
    "::[abc];"
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "::BEGIN;"
    "aba > yz;"
    "::END;"
    "::Upper;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [15]
    "::[abc];"
    "abc > xy;"
    "::Null;"
    "aba > yz;"
    "::Upper;",

    // [16]
/*
    "::[abc];"
    "::BEGIN;"
    "abc <> xy;"
    "::END;"
    "::BEGIN;"
    "aba <> yz;"
    "::END;"
    "::Upper(Lower);"
    "::([XYZ]);"
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [17] -- the one rule set that is also instantiated in the reverse direction
    "::[abc];"
    "abc <> xy;"
    "::Null;"
    "aba <> yz;"
    "::Upper(Lower);"
    "::([XYZ]);"
};
4375
4376 /*
4377 (This entire test is commented out below and will need some heavy revision when we re-add
4378 the ::BEGIN/::END stuff)
4379 static const char* BOGUS_BEGIN_END_RULES[] = {
4380 // [7]
4381 "::BEGIN;"
4382 "abc > xy;"
4383 "::BEGIN;"
4384 "aba > z;"
4385 "::END;"
4386 "::END;",
4387
4388 // [8]
4389 "abc > xy;"
4390 " aba > z;"
4391 "::END;",
4392
4393 // [9]
4394 "::BEGIN;"
4395 "::Upper;"
4396 "::END;"
4397 };
4398 static const int32_t BOGUS_BEGIN_END_RULES_length = UPRV_LENGTHOF(BOGUS_BEGIN_END_RULES);
4399 */
4400
// Flat table of test cases consumed three entries at a time by TestBeginEnd()
// and TestBeginEndToRules(): { rules, input, expected output }. Commented-out
// rows correspond to the rule sets that are blanked out in BEGIN_END_RULES.
static const char* BEGIN_END_TEST_CASES[] = {
    // rules input expected output
    BEGIN_END_RULES[0], "abc ababc aba", "xy zbc z",
//    BEGIN_END_RULES[1], "abc ababc aba", "xy abxy z",
//    BEGIN_END_RULES[2], "abc ababc aba", "xy abxy z",
//    BEGIN_END_RULES[3], "abc ababc aba", "xy abxy z",
    BEGIN_END_RULES[4], "abc ababc aba", "xy abxy z",
    BEGIN_END_RULES[5], "abccabaacababcbc", "PXAARXQBR",

    BEGIN_END_RULES[6], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[7], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[8], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[9], "e e - e---e- e", "e e e-e-e",
//    BEGIN_END_RULES[10], "e e - e---e- e", "e e e-e-e",
//    BEGIN_END_RULES[11], "e e - e---e- e", "e e e-e-e",
//    BEGIN_END_RULES[12], "e e - e---e- e", "e e e-e-e",
//    BEGIN_END_RULES[12], "a a a a", "a%a%a%a",
//    BEGIN_END_RULES[12], "a a-b c b a", "a%a-b cb-a",
    BEGIN_END_RULES[13], "e e - e---e- e", "e e e-e-e",
    BEGIN_END_RULES[13], "a a a a", "a%a%a%a",
    BEGIN_END_RULES[13], "a a-b c b a", "a%a-b cb-a",

//    BEGIN_END_RULES[14], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[15], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
//    BEGIN_END_RULES[16], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[17], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ"
};
// Total number of strings in the table (three per test case).
static const int32_t BEGIN_END_TEST_CASES_length = UPRV_LENGTHOF(BEGIN_END_TEST_CASES);
4429
TestBeginEnd()4430 void TransliteratorTest::TestBeginEnd() {
4431 // run through the list of test cases above
4432 int32_t i = 0;
4433 for (i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4434 expect((UnicodeString)"Test case #" + (i / 3),
4435 UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4436 UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4437 UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4438 }
4439
4440 // instantiate the one reversible rule set in the reverse direction and make sure it does the right thing
4441 UParseError parseError;
4442 UErrorCode status = U_ZERO_ERROR;
4443 Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4444 UTRANS_REVERSE, parseError, status);
4445 if (reversed == 0 || U_FAILURE(status)) {
4446 reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4447 } else {
4448 expect(*reversed, UnicodeString("xy XY XYZ yz YZ"), UnicodeString("xy abc xaba yz aba"));
4449 }
4450 delete reversed;
4451
4452 // finally, run through the list of syntactically-ill-formed rule sets above and make sure
4453 // that all of them cause errors
4454 /*
4455 (commented out until we have the real ::BEGIN/::END stuff in place
4456 for (i = 0; i < BOGUS_BEGIN_END_RULES_length; i++) {
4457 UParseError parseError;
4458 UErrorCode status = U_ZERO_ERROR;
4459 Transliterator* t = Transliterator::createFromRules("foo", UnicodeString(BOGUS_BEGIN_END_RULES[i]),
4460 UTRANS_FORWARD, parseError, status);
4461 if (!U_FAILURE(status)) {
4462 delete t;
4463 errln((UnicodeString)"Should have gotten syntax error from " + BOGUS_BEGIN_END_RULES[i]);
4464 }
4465 }
4466 */
4467 }
4468
TestBeginEndToRules()4469 void TransliteratorTest::TestBeginEndToRules() {
4470 // run through the same list of test cases we used above, but this time, instead of just
4471 // instantiating a Transliterator from the rules and running the test against it, we instantiate
4472 // a Transliterator from the rules, do toRules() on it, instantiate a Transliterator from
4473 // the resulting set of rules, and make sure that the generated rule set is semantically equivalent
4474 // to (i.e., does the same thing as) the original rule set
4475 for (int32_t i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4476 UParseError parseError;
4477 UErrorCode status = U_ZERO_ERROR;
4478 Transliterator* t = Transliterator::createFromRules("--", UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4479 UTRANS_FORWARD, parseError, status);
4480 if (U_FAILURE(status)) {
4481 reportParseError(UnicodeString("FAIL: Couldn't create transliterator"), parseError, status);
4482 } else {
4483 UnicodeString rules;
4484 t->toRules(rules, TRUE);
4485 Transliterator* t2 = Transliterator::createFromRules((UnicodeString)"Test case #" + (i / 3), rules,
4486 UTRANS_FORWARD, parseError, status);
4487 if (U_FAILURE(status)) {
4488 reportParseError(UnicodeString("FAIL: Couldn't create transliterator from generated rules"),
4489 parseError, status);
4490 delete t;
4491 } else {
4492 expect(*t2,
4493 UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4494 UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4495 delete t;
4496 delete t2;
4497 }
4498 }
4499 }
4500
4501 // do the same thing for the reversible test case
4502 UParseError parseError;
4503 UErrorCode status = U_ZERO_ERROR;
4504 Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4505 UTRANS_REVERSE, parseError, status);
4506 if (U_FAILURE(status)) {
4507 reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4508 } else {
4509 UnicodeString rules;
4510 reversed->toRules(rules, FALSE);
4511 Transliterator* reversed2 = Transliterator::createFromRules("Reversed", rules, UTRANS_FORWARD,
4512 parseError, status);
4513 if (U_FAILURE(status)) {
4514 reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator from generated rules"),
4515 parseError, status);
4516 delete reversed;
4517 } else {
4518 expect(*reversed2,
4519 UnicodeString("xy XY XYZ yz YZ"),
4520 UnicodeString("xy abc xaba yz aba"));
4521 delete reversed;
4522 delete reversed2;
4523 }
4524 }
4525 }
4526
TestRegisterAlias()4527 void TransliteratorTest::TestRegisterAlias() {
4528 UnicodeString longID("Lower;[aeiou]Upper");
4529 UnicodeString shortID("Any-CapVowels");
4530 UnicodeString reallyShortID("CapVowels");
4531
4532 Transliterator::registerAlias(shortID, longID);
4533
4534 UErrorCode err = U_ZERO_ERROR;
4535 Transliterator* t1 = Transliterator::createInstance(longID, UTRANS_FORWARD, err);
4536 if (U_FAILURE(err)) {
4537 errln("Failed to instantiate transliterator with long ID");
4538 Transliterator::unregister(shortID);
4539 return;
4540 }
4541 Transliterator* t2 = Transliterator::createInstance(reallyShortID, UTRANS_FORWARD, err);
4542 if (U_FAILURE(err)) {
4543 errln("Failed to instantiate transliterator with short ID");
4544 delete t1;
4545 Transliterator::unregister(shortID);
4546 return;
4547 }
4548
4549 if (t1->getID() != longID)
4550 errln("Transliterator instantiated with long ID doesn't have long ID");
4551 if (t2->getID() != reallyShortID)
4552 errln("Transliterator instantiated with short ID doesn't have short ID");
4553
4554 UnicodeString rules1;
4555 UnicodeString rules2;
4556
4557 t1->toRules(rules1, TRUE);
4558 t2->toRules(rules2, TRUE);
4559 if (rules1 != rules2)
4560 errln("Alias transliterators aren't the same");
4561
4562 delete t1;
4563 delete t2;
4564 Transliterator::unregister(shortID);
4565
4566 t1 = Transliterator::createInstance(shortID, UTRANS_FORWARD, err);
4567 if (U_SUCCESS(err)) {
4568 errln("Instantiation with short ID succeeded after short ID was unregistered");
4569 delete t1;
4570 }
4571
4572 // try the same thing again, but this time with something other than
4573 // an instance of CompoundTransliterator
4574 UnicodeString realID("Latin-Greek");
4575 UnicodeString fakeID("Latin-dlgkjdflkjdl");
4576 Transliterator::registerAlias(fakeID, realID);
4577
4578 err = U_ZERO_ERROR;
4579 t1 = Transliterator::createInstance(realID, UTRANS_FORWARD, err);
4580 if (U_FAILURE(err)) {
4581 dataerrln("Failed to instantiate transliterator with real ID - %s", u_errorName(err));
4582 Transliterator::unregister(realID);
4583 return;
4584 }
4585 t2 = Transliterator::createInstance(fakeID, UTRANS_FORWARD, err);
4586 if (U_FAILURE(err)) {
4587 errln("Failed to instantiate transliterator with fake ID");
4588 delete t1;
4589 Transliterator::unregister(realID);
4590 return;
4591 }
4592
4593 t1->toRules(rules1, TRUE);
4594 t2->toRules(rules2, TRUE);
4595 if (rules1 != rules2)
4596 errln("Alias transliterators aren't the same");
4597
4598 delete t1;
4599 delete t2;
4600 Transliterator::unregister(fakeID);
4601 }
4602
TestRuleStripping()4603 void TransliteratorTest::TestRuleStripping() {
4604 /*
4605 #
4606 \uE001>\u0C01; # SIGN
4607 */
4608 static const UChar rule[] = {
4609 0x0023,0x0020,0x000D,0x000A,
4610 0xE001,0x003E,0x0C01,0x003B,0x0020,0x0023,0x0020,0x0053,0x0049,0x0047,0x004E,0
4611 };
4612 static const UChar expectedRule[] = {
4613 0xE001,0x003E,0x0C01,0x003B,0
4614 };
4615 UChar result[UPRV_LENGTHOF(rule)];
4616 UErrorCode status = U_ZERO_ERROR;
4617 int32_t len = utrans_stripRules(rule, UPRV_LENGTHOF(rule), result, &status);
4618 if (len != u_strlen(expectedRule)) {
4619 errln("utrans_stripRules return len = %d", len);
4620 }
4621 if (u_strncmp(expectedRule, result, len) != 0) {
4622 errln("utrans_stripRules did not return expected string");
4623 }
4624 }
4625
4626 /**
4627 * Test the Halfwidth-Fullwidth transliterator (ticket 6281).
4628 */
TestHalfwidthFullwidth(void)4629 void TransliteratorTest::TestHalfwidthFullwidth(void) {
4630 UParseError parseError;
4631 UErrorCode status = U_ZERO_ERROR;
4632 Transliterator* hf = Transliterator::createInstance("Halfwidth-Fullwidth", UTRANS_FORWARD, parseError, status);
4633 Transliterator* fh = Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, parseError, status);
4634 if (hf == 0 || fh == 0) {
4635 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4636 delete hf;
4637 delete fh;
4638 return;
4639 }
4640
4641 // Array of 2n items
4642 // Each item is
4643 // "hf"|"fh"|"both",
4644 // <Halfwidth>,
4645 // <Fullwidth>
4646 const char* DATA[] = {
4647 "both",
4648 "\\uFFE9\\uFFEA\\uFFEB\\uFFEC\\u0061\\uFF71\\u00AF\\u0020",
4649 "\\u2190\\u2191\\u2192\\u2193\\uFF41\\u30A2\\uFFE3\\u3000",
4650 };
4651 int32_t DATA_length = UPRV_LENGTHOF(DATA);
4652
4653 for (int32_t i=0; i<DATA_length; i+=3) {
4654 UnicodeString h = CharsToUnicodeString(DATA[i+1]);
4655 UnicodeString f = CharsToUnicodeString(DATA[i+2]);
4656 switch (*DATA[i]) {
4657 case 0x68: //'h': // Halfwidth-Fullwidth only
4658 expect(*hf, h, f);
4659 break;
4660 case 0x66: //'f': // Fullwidth-Halfwidth only
4661 expect(*fh, f, h);
4662 break;
4663 case 0x62: //'b': // both directions
4664 expect(*hf, h, f);
4665 expect(*fh, f, h);
4666 break;
4667 }
4668 }
4669 delete hf;
4670 delete fh;
4671 }
4672
4673
4674 /**
4675 * Test Thai. The text is the first paragraph of "What is Unicode" from the Unicode.org web site.
4676 * TODO: confirm that the expected results are correct.
4677 * For now, test just confirms that C++ and Java give identical results.
4678 */
TestThai(void)4679 void TransliteratorTest::TestThai(void) {
4680 #if !UCONFIG_NO_BREAK_ITERATION
4681 UParseError parseError;
4682 UErrorCode status = U_ZERO_ERROR;
4683 Transliterator* tr = Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4684 if (tr == 0) {
4685 dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4686 return;
4687 }
4688 if (U_FAILURE(status)) {
4689 errln("FAIL: createInstance failed with %s", u_errorName(status));
4690 return;
4691 }
4692 const char *thaiText =
4693 "\\u0e42\\u0e14\\u0e22\\u0e1e\\u0e37\\u0e49\\u0e19\\u0e10\\u0e32\\u0e19\\u0e41\\u0e25\\u0e49\\u0e27, \\u0e04\\u0e2d"
4694 "\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d\\u0e23\\u0e4c\\u0e08\\u0e30\\u0e40\\u0e01\\u0e35\\u0e48\\u0e22"
4695 "\\u0e27\\u0e02\\u0e49\\u0e2d\\u0e07\\u0e01\\u0e31\\u0e1a\\u0e40\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e02\\u0e2d"
4696 "\\u0e07\\u0e15\\u0e31\\u0e27\\u0e40\\u0e25\\u0e02. \\u0e04\\u0e2d\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d"
4697 "\\u0e23\\u0e4c\\u0e08\\u0e31\\u0e14\\u0e40\\u0e01\\u0e47\\u0e1a\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29"
4698 "\\u0e23\\u0e41\\u0e25\\u0e30\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30\\u0e2d\\u0e37\\u0e48\\u0e19\\u0e46 \\u0e42"
4699 "\\u0e14\\u0e22\\u0e01\\u0e32\\u0e23\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25"
4700 "\\u0e02\\u0e43\\u0e2b\\u0e49\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e41\\u0e15\\u0e48\\u0e25\\u0e30\\u0e15"
4701 "\\u0e31\\u0e27. \\u0e01\\u0e48\\u0e2d\\u0e19\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48\\u0e4a Unicode \\u0e08"
4702 "\\u0e30\\u0e16\\u0e39\\u0e01\\u0e2a\\u0e23\\u0e49\\u0e32\\u0e07\\u0e02\\u0e36\\u0e49\\u0e19, \\u0e44\\u0e14\\u0e49"
4703 "\\u0e21\\u0e35\\u0e23\\u0e30\\u0e1a\\u0e1a encoding \\u0e2d\\u0e22\\u0e39\\u0e48\\u0e2b\\u0e25\\u0e32\\u0e22\\u0e23"
4704 "\\u0e49\\u0e2d\\u0e22\\u0e23\\u0e30\\u0e1a\\u0e1a\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e01\\u0e32\\u0e23"
4705 "\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25\\u0e02\\u0e40\\u0e2b\\u0e25\\u0e48"
4706 "\\u0e32\\u0e19\\u0e35\\u0e49. \\u0e44\\u0e21\\u0e48\\u0e21\\u0e35 encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48"
4707 "\\u0e21\\u0e35\\u0e08\\u0e33\\u0e19\\u0e27\\u0e19\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30"
4708 "\\u0e21\\u0e32\\u0e01\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d: \\u0e22\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d"
4709 "\\u0e22\\u0e48\\u0e32\\u0e07\\u0e40\\u0e0a\\u0e48\\u0e19, \\u0e40\\u0e09\\u0e1e\\u0e32\\u0e30\\u0e43\\u0e19\\u0e01"
4710 "\\u0e25\\u0e38\\u0e48\\u0e21\\u0e2a\\u0e2b\\u0e20\\u0e32\\u0e1e\\u0e22\\u0e38\\u0e42\\u0e23\\u0e1b\\u0e40\\u0e1e"
4711 "\\u0e35\\u0e22\\u0e07\\u0e41\\u0e2b\\u0e48\\u0e07\\u0e40\\u0e14\\u0e35\\u0e22\\u0e27 \\u0e01\\u0e47\\u0e15\\u0e49"
4712 "\\u0e2d\\u0e07\\u0e01\\u0e32\\u0e23\\u0e2b\\u0e25\\u0e32\\u0e22 encoding \\u0e43\\u0e19\\u0e01\\u0e32\\u0e23\\u0e04"
4713 "\\u0e23\\u0e2d\\u0e1a\\u0e04\\u0e25\\u0e38\\u0e21\\u0e17\\u0e38\\u0e01\\u0e20\\u0e32\\u0e29\\u0e32\\u0e43\\u0e19"
4714 "\\u0e01\\u0e25\\u0e38\\u0e48\\u0e21. \\u0e2b\\u0e23\\u0e37\\u0e2d\\u0e41\\u0e21\\u0e49\\u0e41\\u0e15\\u0e48\\u0e43"
4715 "\\u0e19\\u0e20\\u0e32\\u0e29\\u0e32\\u0e40\\u0e14\\u0e35\\u0e48\\u0e22\\u0e27 \\u0e40\\u0e0a\\u0e48\\u0e19 \\u0e20"
4716 "\\u0e32\\u0e29\\u0e32\\u0e2d\\u0e31\\u0e07\\u0e01\\u0e24\\u0e29 \\u0e01\\u0e47\\u0e44\\u0e21\\u0e48\\u0e21\\u0e35"
4717 " encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d\\u0e2a\\u0e33\\u0e2b"
4718 "\\u0e23\\u0e31\\u0e1a\\u0e17\\u0e38\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29\\u0e23, \\u0e40\\u0e04"
4719 "\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e27\\u0e23\\u0e23\\u0e04\\u0e15\\u0e2d\\u0e19"
4720 " \\u0e41\\u0e25\\u0e30\\u0e2a\\u0e31\\u0e0d\\u0e25\\u0e31\\u0e01\\u0e29\\u0e13\\u0e4c\\u0e17\\u0e32\\u0e07\\u0e40"
4721 "\\u0e17\\u0e04\\u0e19\\u0e34\\u0e04\\u0e17\\u0e35\\u0e48\\u0e43\\u0e0a\\u0e49\\u0e01\\u0e31\\u0e19\\u0e2d\\u0e22"
4722 "\\u0e39\\u0e48\\u0e17\\u0e31\\u0e48\\u0e27\\u0e44\\u0e1b.";
4723
4724 const char *latinText =
4725 "doy ph\\u1ee5\\u0304\\u0302n \\u1e6d\\u0304h\\u0101n l\\u00e6\\u0302w, khxmphiwtexr\\u0312 ca ke\\u012b\\u0300"
4726 "ywk\\u0304\\u0125xng k\\u1ea1b re\\u1ee5\\u0304\\u0300xng k\\u0304hxng t\\u1ea1wlek\\u0304h. khxmphiwtexr"
4727 "\\u0312 c\\u1ea1d k\\u0115b t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r l\\u00e6a x\\u1ea1kk\\u0304h ra x\\u1ee5\\u0304"
4728 "\\u0300n\\u00ab doy k\\u0101r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304\\u0131\\u0302 s\\u0304"
4729 "\\u1ea3h\\u0304r\\u1ea1b t\\u00e6\\u0300la t\\u1ea1w. k\\u0300xn h\\u0304n\\u0302\\u0101 th\\u012b\\u0300\\u0301"
4730 " Unicode ca t\\u0304h\\u016bk s\\u0304r\\u0302\\u0101ng k\\u0304h\\u1ee5\\u0302n, d\\u1ecb\\u0302 m\\u012b "
4731 "rabb encoding xy\\u016b\\u0300 h\\u0304l\\u0101y r\\u0302xy rabb s\\u0304\\u1ea3h\\u0304r\\u1ea1b k\\u0101"
4732 "r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304el\\u0300\\u0101 n\\u012b\\u0302. m\\u1ecb\\u0300m"
4733 "\\u012b encoding d\\u0131 th\\u012b\\u0300 m\\u012b c\\u1ea3nwn t\\u1ea1w x\\u1ea1kk\\u0304hra m\\u0101k p"
4734 "he\\u012byng phx: yk t\\u1ea1wx\\u1ef3\\u0101ng ch\\u00e8n, c\\u0304heph\\u0101a n\\u0131 kl\\u00f9m s\\u0304"
4735 "h\\u0304p\\u0323h\\u0101ph yurop phe\\u012byng h\\u0304\\u00e6\\u0300ng de\\u012byw k\\u0306 t\\u0302xngk\\u0101"
4736 "r h\\u0304l\\u0101y encoding n\\u0131 k\\u0101r khrxbkhlum thuk p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 n\\u0131"
4737 " kl\\u00f9m. h\\u0304r\\u1ee5\\u0304x m\\u00e6\\u0302t\\u00e6\\u0300 n\\u0131 p\\u0323h\\u0101s\\u0304\\u02b9"
4738 "\\u0101 de\\u012b\\u0300yw ch\\u00e8n p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 x\\u1ea1ngkvs\\u0304\\u02b9 k\\u0306"
4739 " m\\u1ecb\\u0300m\\u012b encoding d\\u0131 th\\u012b\\u0300 phe\\u012byng phx s\\u0304\\u1ea3h\\u0304r\\u1ea1"
4740 "b thuk t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r, kher\\u1ee5\\u0304\\u0300xngh\\u0304m\\u0101y wrrkh txn l\\u00e6"
4741 "a s\\u0304\\u1ea1\\u1ef5l\\u1ea1ks\\u0304\\u02b9\\u1e47\\u0312 th\\u0101ng thekhnikh th\\u012b\\u0300 ch\\u0131"
4742 "\\u0302 k\\u1ea1n xy\\u016b\\u0300 th\\u1ea1\\u0300wp\\u1ecb.";
4743
4744
4745 UnicodeString xlitText(thaiText);
4746 xlitText = xlitText.unescape();
4747 tr->transliterate(xlitText);
4748
4749 UnicodeString expectedText(latinText);
4750 expectedText = expectedText.unescape();
4751 expect(*tr, xlitText, expectedText);
4752
4753 delete tr;
4754 #endif
4755 }
4756
4757
4758 //======================================================================
4759 // Support methods
4760 //======================================================================
expectT(const UnicodeString & id,const UnicodeString & source,const UnicodeString & expectedResult)4761 void TransliteratorTest::expectT(const UnicodeString& id,
4762 const UnicodeString& source,
4763 const UnicodeString& expectedResult) {
4764 UErrorCode ec = U_ZERO_ERROR;
4765 UParseError pe;
4766 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
4767 if (U_FAILURE(ec)) {
4768 errln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(ec));
4769 delete t;
4770 return;
4771 }
4772 expect(*t, source, expectedResult);
4773 delete t;
4774 }
4775
reportParseError(const UnicodeString & message,const UParseError & parseError,const UErrorCode & status)4776 void TransliteratorTest::reportParseError(const UnicodeString& message,
4777 const UParseError& parseError,
4778 const UErrorCode& status) {
4779 dataerrln(message +
4780 /*", parse error " + parseError.code +*/
4781 ", line " + parseError.line +
4782 ", offset " + parseError.offset +
4783 ", pre-context " + prettify(parseError.preContext, TRUE) +
4784 ", post-context " + prettify(parseError.postContext,TRUE) +
4785 ", Error: " + u_errorName(status));
4786 }
4787
expect(const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4788 void TransliteratorTest::expect(const UnicodeString& rules,
4789 const UnicodeString& source,
4790 const UnicodeString& expectedResult,
4791 UTransPosition *pos) {
4792 expect("<ID>", rules, source, expectedResult, pos);
4793 }
4794
expect(const UnicodeString & id,const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4795 void TransliteratorTest::expect(const UnicodeString& id,
4796 const UnicodeString& rules,
4797 const UnicodeString& source,
4798 const UnicodeString& expectedResult,
4799 UTransPosition *pos) {
4800 UErrorCode status = U_ZERO_ERROR;
4801 UParseError parseError;
4802 Transliterator* t = Transliterator::createFromRules(id, rules, UTRANS_FORWARD, parseError, status);
4803 if (U_FAILURE(status)) {
4804 reportParseError(UnicodeString("Couldn't create transliterator from ") + rules, parseError, status);
4805 } else {
4806 expect(*t, source, expectedResult, pos);
4807 }
4808 delete t;
4809 }
4810
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,const Transliterator & reverseTransliterator)4811 void TransliteratorTest::expect(const Transliterator& t,
4812 const UnicodeString& source,
4813 const UnicodeString& expectedResult,
4814 const Transliterator& reverseTransliterator) {
4815 expect(t, source, expectedResult);
4816 expect(reverseTransliterator, expectedResult, source);
4817 }
4818
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4819 void TransliteratorTest::expect(const Transliterator& t,
4820 const UnicodeString& source,
4821 const UnicodeString& expectedResult,
4822 UTransPosition *pos) {
4823 if (pos == 0) {
4824 UnicodeString result(source);
4825 t.transliterate(result);
4826 expectAux(t.getID() + ":String", source, result, expectedResult);
4827 }
4828 UTransPosition index={0, 0, 0, 0};
4829 if (pos != 0) {
4830 index = *pos;
4831 }
4832
4833 UnicodeString rsource(source);
4834 if (pos == 0) {
4835 t.transliterate(rsource);
4836 } else {
4837 // Do it all at once -- below we do it incrementally
4838 t.finishTransliteration(rsource, *pos);
4839 }
4840 expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
4841
4842 // Test keyboard (incremental) transliteration -- this result
4843 // must be the same after we finalize (see below).
4844 UnicodeString log;
4845 rsource.remove();
4846 if (pos != 0) {
4847 rsource = source;
4848 formatInput(log, rsource, index);
4849 log.append(" -> ");
4850 UErrorCode status = U_ZERO_ERROR;
4851 t.transliterate(rsource, index, status);
4852 formatInput(log, rsource, index);
4853 } else {
4854 for (int32_t i=0; i<source.length(); ++i) {
4855 if (i != 0) {
4856 log.append(" + ");
4857 }
4858 log.append(source.charAt(i)).append(" -> ");
4859 UErrorCode status = U_ZERO_ERROR;
4860 t.transliterate(rsource, index, source.charAt(i), status);
4861 formatInput(log, rsource, index);
4862 }
4863 }
4864
4865 // As a final step in keyboard transliteration, we must call
4866 // transliterate to finish off any pending partial matches that
4867 // were waiting for more input.
4868 t.finishTransliteration(rsource, index);
4869 log.append(" => ").append(rsource);
4870
4871 expectAux(t.getID() + ":Keyboard", log,
4872 rsource == expectedResult,
4873 expectedResult);
4874 }
4875
4876
4877 /**
4878 * @param appendTo result is appended to this param.
4879 * @param input the string being transliterated
4880 * @param pos the index struct
4881 */
formatInput(UnicodeString & appendTo,const UnicodeString & input,const UTransPosition & pos)4882 UnicodeString& TransliteratorTest::formatInput(UnicodeString &appendTo,
4883 const UnicodeString& input,
4884 const UTransPosition& pos) {
4885 // Output a string of the form aaa{bbb|ccc|ddd}eee, where
4886 // the {} indicate the context start and limit, and the ||
4887 // indicate the start and limit.
4888 if (0 <= pos.contextStart &&
4889 pos.contextStart <= pos.start &&
4890 pos.start <= pos.limit &&
4891 pos.limit <= pos.contextLimit &&
4892 pos.contextLimit <= input.length()) {
4893
4894 UnicodeString a, b, c, d, e;
4895 input.extractBetween(0, pos.contextStart, a);
4896 input.extractBetween(pos.contextStart, pos.start, b);
4897 input.extractBetween(pos.start, pos.limit, c);
4898 input.extractBetween(pos.limit, pos.contextLimit, d);
4899 input.extractBetween(pos.contextLimit, input.length(), e);
4900 appendTo.append(a).append((UChar)123/*{*/).append(b).
4901 append((UChar)PIPE).append(c).append((UChar)PIPE).append(d).
4902 append((UChar)125/*}*/).append(e);
4903 } else {
4904 appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
4905 pos.contextStart + ", s=" + pos.start + ", l=" +
4906 pos.limit + ", cl=" + pos.contextLimit + "} on " +
4907 input);
4908 }
4909 return appendTo;
4910 }
4911
expectAux(const UnicodeString & tag,const UnicodeString & source,const UnicodeString & result,const UnicodeString & expectedResult)4912 void TransliteratorTest::expectAux(const UnicodeString& tag,
4913 const UnicodeString& source,
4914 const UnicodeString& result,
4915 const UnicodeString& expectedResult) {
4916 expectAux(tag, source + " -> " + result,
4917 result == expectedResult,
4918 expectedResult);
4919 }
4920
expectAux(const UnicodeString & tag,const UnicodeString & summary,UBool pass,const UnicodeString & expectedResult)4921 void TransliteratorTest::expectAux(const UnicodeString& tag,
4922 const UnicodeString& summary, UBool pass,
4923 const UnicodeString& expectedResult) {
4924 if (pass) {
4925 logln(UnicodeString("(")+tag+") " + prettify(summary));
4926 } else {
4927 dataerrln(UnicodeString("FAIL: (")+tag+") "
4928 + prettify(summary)
4929 + ", expected " + prettify(expectedResult));
4930 }
4931 }
4932
4933 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
4934