/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 1996-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package ohos.global.icu.dev.test.normalizer;

import java.text.StringCharacterIterator;
import java.util.Random;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

import ohos.global.icu.dev.test.TestFmwk;
import ohos.global.icu.impl.Norm2AllModes;
import ohos.global.icu.impl.Normalizer2Impl;
import ohos.global.icu.impl.USerializedSet;
import ohos.global.icu.impl.Utility;
import ohos.global.icu.lang.UCharacter;
import ohos.global.icu.lang.UCharacterCategory;
import ohos.global.icu.lang.UProperty;
import ohos.global.icu.text.FilteredNormalizer2;
import ohos.global.icu.text.Normalizer;
import ohos.global.icu.text.Normalizer2;
import ohos.global.icu.text.UCharacterIterator;
import ohos.global.icu.text.UTF16;
import ohos.global.icu.text.UnicodeSet;
import ohos.global.icu.text.UnicodeSetIterator;


@RunWith(JUnit4.class)
public class BasicTest extends TestFmwk {
    String[][] canonTests = {
        // Input                    Decomposed                  Composed
        { "cat",                    "cat",                      "cat"                   },
        { "\u00e0ardvark",          "a\u0300ardvark",           "\u00e0ardvark",        },

        { "\u1e0a",                 "D\u0307",                  "\u1e0a"                }, // D-dot_above
        { "D\u0307",                "D\u0307",                  "\u1e0a"                }, // D dot_above

        { "\u1e0c\u0307",           "D\u0323\u0307",            "\u1e0c\u0307"          }, // D-dot_below dot_above
        { "\u1e0a\u0323",           "D\u0323\u0307",            "\u1e0c\u0307"          }, // D-dot_above dot_below
        { "D\u0307\u0323",          "D\u0323\u0307",            "\u1e0c\u0307"          }, // D dot_below dot_above

        { "\u1e10\u0307\u0323",     "D\u0327\u0323\u0307",      "\u1e10\u0323\u0307"    }, // D dot_below cedilla dot_above
        { "D\u0307\u0328\u0323",    "D\u0328\u0323\u0307",      "\u1e0c\u0328\u0307"    }, // D dot_above ogonek dot_below

        { "\u1E14",                 "E\u0304\u0300",            "\u1E14"                }, // E-macron-grave
        { "\u0112\u0300",           "E\u0304\u0300",            "\u1E14"                }, // E-macron + grave
        { "\u00c8\u0304",           "E\u0300\u0304",            "\u00c8\u0304"          }, // E-grave + macron

        { "\u212b",                 "A\u030a",                  "\u00c5"                }, // angstrom_sign
        { "\u00c5",                 "A\u030a",                  "\u00c5"                }, // A-ring

        { "\u00c4ffin",             "A\u0308ffin",              "\u00c4ffin"            },
        { "\u00c4\uFB03n",          "A\u0308\uFB03n",           "\u00c4\uFB03n"         },

        { "\u00fdffin",             "y\u0301ffin",              "\u00fdffin"            }, //updated with 3.0
        { "\u00fd\uFB03n",          "y\u0301\uFB03n",           "\u00fd\uFB03n"         }, //updated with 3.0

        { "Henry IV",               "Henry IV",                 "Henry IV"              },
        { "Henry \u2163",           "Henry \u2163",             "Henry \u2163"          },

        { "\u30AC",                 "\u30AB\u3099",             "\u30AC"                }, // ga (Katakana)
        { "\u30AB\u3099",           "\u30AB\u3099",             "\u30AC"                }, // ka + ten
        { "\uFF76\uFF9E",           "\uFF76\uFF9E",             "\uFF76\uFF9E"          }, // hw_ka + hw_ten
        { "\u30AB\uFF9E",           "\u30AB\uFF9E",             "\u30AB\uFF9E"          }, // ka + hw_ten
        { "\uFF76\u3099",           "\uFF76\u3099",             "\uFF76\u3099"          }, // hw_ka + ten

        { "A\u0300\u0316",          "A\u0316\u0300",            "\u00C0\u0316"          },
        {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"},
    };

    String[][] compatTests = {
        // Input                    Decomposed                  Composed
        { "cat",                    "cat",                      "cat"                   },
        { "\uFB4f",                 "\u05D0\u05DC",             "\u05D0\u05DC",         }, // Alef-Lamed vs. Alef, Lamed

        { "\u00C4ffin",             "A\u0308ffin",              "\u00C4ffin"            },
        { "\u00C4\uFB03n",          "A\u0308ffin",              "\u00C4ffin"            }, // ffi ligature -> f + f + i

        { "\u00fdffin",             "y\u0301ffin",              "\u00fdffin"            }, //updated for 3.0
        { "\u00fd\uFB03n",          "y\u0301ffin",              "\u00fdffin"            }, // ffi ligature -> f + f + i

        { "Henry IV",               "Henry IV",                 "Henry IV"              },
        { "Henry \u2163",           "Henry IV",                 "Henry IV"              },

        { "\u30AC",                 "\u30AB\u3099",             "\u30AC"                }, // ga (Katakana)
        { "\u30AB\u3099",           "\u30AB\u3099",             "\u30AC"                }, // ka + ten

        { "\uFF76\u3099",           "\u30AB\u3099",             "\u30AC"                }, // hw_ka + ten

        /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later */
        { "\uFF76\uFF9E",           "\u30AB\u3099",             "\u30AC"                }, // hw_ka + hw_ten
        { "\u30AB\uFF9E",           "\u30AB\u3099",             "\u30AC"                }, // ka + hw_ten

    };

    // With Canonical decomposition, Hangul syllables should get decomposed
    // into Jamo, but Jamo characters should not be decomposed into
    // conjoining Jamo
    String[][] hangulCanon = {
        // Input                    Decomposed                  Composed
        { "\ud4db",                 "\u1111\u1171\u11b6",       "\ud4db"                },
        { "\u1111\u1171\u11b6",     "\u1111\u1171\u11b6",       "\ud4db"                },
    };

    // With compatibility decomposition turned on,
    // it should go all the way down to conjoining Jamo characters.
    // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE
    String[][] hangulCompat = {
        // Input                    Decomposed                  Composed
        // { "\ud4db",              "\u1111\u116e\u1175\u11af\u11c2",   "\ud478\u1175\u11af\u11c2" },
    };

    @Test
    public void TestHangulCompose()
                 throws Exception{
        // Make sure that the static composition methods work
        logln("Canonical composition...");
        staticTest(Normalizer.NFC, hangulCanon,  2);
        logln("Compatibility composition...");
        staticTest(Normalizer.NFKC, hangulCompat, 2);
        // Now try iterative composition....
        logln("Iterative composition...");
        Normalizer norm = new Normalizer("", Normalizer.NFC,0);
        iterateTest(norm, hangulCanon, 2);

        norm.setMode(Normalizer.NFKD);
        iterateTest(norm, hangulCompat, 2);

        // And finally, make sure you can do it in reverse too
        logln("Reverse iteration...");
        norm.setMode(Normalizer.NFC);
        backAndForth(norm, hangulCanon);
    }

    @Test
    public void TestHangulDecomp() throws Exception{
        // Make sure that the static decomposition methods work
        logln("Canonical decomposition...");
        staticTest(Normalizer.NFD, hangulCanon, 1);
        logln("Compatibility decomposition...");
        staticTest(Normalizer.NFKD, hangulCompat, 1);

        // Now the iterative decomposition methods...
155 logln("Iterative decomposition..."); 156 Normalizer norm = new Normalizer("", Normalizer.NFD,0); 157 iterateTest(norm, hangulCanon, 1); 158 159 norm.setMode(Normalizer.NFKD); 160 iterateTest(norm, hangulCompat, 1); 161 162 // And finally, make sure you can do it in reverse too 163 logln("Reverse iteration..."); 164 norm.setMode(Normalizer.NFD); 165 backAndForth(norm, hangulCanon); 166 } 167 @Test TestNone()168 public void TestNone() throws Exception{ 169 Normalizer norm = new Normalizer("", Normalizer.NONE,0); 170 iterateTest(norm, canonTests, 0); 171 staticTest(Normalizer.NONE, canonTests, 0); 172 } 173 @Test TestDecomp()174 public void TestDecomp() throws Exception{ 175 Normalizer norm = new Normalizer("", Normalizer.NFD,0); 176 iterateTest(norm, canonTests, 1); 177 staticTest(Normalizer.NFD, canonTests, 1); 178 decomposeTest(Normalizer.NFD, canonTests, 1); 179 } 180 181 @Test TestCompatDecomp()182 public void TestCompatDecomp() throws Exception{ 183 Normalizer norm = new Normalizer("", Normalizer.NFKD,0); 184 iterateTest(norm, compatTests, 1); 185 staticTest(Normalizer.NFKD,compatTests, 1); 186 decomposeTest(Normalizer.NFKD,compatTests, 1); 187 } 188 189 @Test TestCanonCompose()190 public void TestCanonCompose() throws Exception{ 191 Normalizer norm = new Normalizer("", Normalizer.NFC,0); 192 staticTest(Normalizer.NFC, canonTests, 2); 193 iterateTest(norm, canonTests, 2); 194 composeTest(Normalizer.NFC, canonTests, 2); 195 } 196 197 @Test TestCompatCompose()198 public void TestCompatCompose() throws Exception{ 199 Normalizer norm = new Normalizer("", Normalizer.NFKC,0); 200 iterateTest(norm, compatTests, 2); 201 staticTest(Normalizer.NFKC,compatTests, 2); 202 composeTest(Normalizer.NFKC,compatTests, 2); 203 } 204 205 @Test TestExplodingBase()206 public void TestExplodingBase() throws Exception{ 207 // \u017f - Latin small letter long s 208 // \u0307 - combining dot above 209 // \u1e61 - Latin small letter s with dot above 210 // \u1e9b - Latin small letter long s with dot above 211 String[][] canon = { 212 // Input Decomposed Composed 213 { "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" }, 214 { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" }, 215 }; 216 String[][] compat = { 217 // Input Decomposed Composed 218 { "\u017f", "s", "s" }, 219 { "\u1e9b", "s\u0307", "\u1e61" }, 220 }; 221 222 staticTest(Normalizer.NFD, canon, 1); 223 staticTest(Normalizer.NFC, canon, 2); 224 225 staticTest(Normalizer.NFKD, compat, 1); 226 staticTest(Normalizer.NFKC, compat, 2); 227 228 } 229 230 /** 231 * The Tibetan vowel sign AA, 0f71, was messed up prior to 232 * Unicode version 2.1.9. 233 * Once 2.1.9 or 3.0 is released, uncomment this test. 234 */ 235 @Test TestTibetan()236 public void TestTibetan() throws Exception{ 237 String[][] decomp = { 238 { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" } 239 }; 240 String[][] compose = { 241 { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" } 242 }; 243 244 staticTest(Normalizer.NFD, decomp, 1); 245 staticTest(Normalizer.NFKD,decomp, 2); 246 staticTest(Normalizer.NFC, compose, 1); 247 staticTest(Normalizer.NFKC,compose, 2); 248 } 249 250 /** 251 * Make sure characters in the CompositionExclusion.txt list do not get 252 * composed to. 253 */ 254 @Test TestCompositionExclusion()255 public void TestCompositionExclusion() 256 throws Exception{ 257 // This list is generated from CompositionExclusion.txt. 258 // Update whenever the normalizer tables are updated. 
Note 259 // that we test all characters listed, even those that can be 260 // derived from the Unicode DB and are therefore commented 261 // out. 262 String EXCLUDED = 263 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" + 264 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" + 265 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" + 266 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" + 267 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" + 268 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" + 269 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" + 270 "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" + 271 "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" + 272 "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" + 273 "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" + 274 "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" + 275 "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" + 276 "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E"; 277 for (int i=0; i<EXCLUDED.length(); ++i) { 278 String a = String.valueOf(EXCLUDED.charAt(i)); 279 String b = Normalizer.normalize(a, Normalizer.NFKD); 280 String c = Normalizer.normalize(b, Normalizer.NFC); 281 if (c.equals(a)) { 282 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + 283 hex(b) + " x COMPOSE => " + 284 hex(c)); 285 } else if (isVerbose()) { 286 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + 287 hex(b) + " x COMPOSE => " + 288 hex(c)); 289 } 290 } 291 // The following method works too, but it is somewhat 292 // incestuous. It uses UInfo, which is the same database that 293 // NormalizerBuilder uses, so if something is wrong with 294 // UInfo, the following test won't show it. All it will show 295 // is that NormalizerBuilder has been run with whatever the 296 // current UInfo is. 297 // 298 // We comment this out in favor of the test above, which 299 // provides independent verification (but also requires 300 // independent updating). 301 // logln("---"); 302 // UInfo uinfo = new UInfo(); 303 // for (int i=0; i<=0xFFFF; ++i) { 304 // if (!uinfo.isExcludedComposition((char)i) || 305 // (!uinfo.hasCanonicalDecomposition((char)i) && 306 // !uinfo.hasCompatibilityDecomposition((char)i))) continue; 307 // String a = String.valueOf((char)i); 308 // String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0); 309 // String c = Normalizer.normalize(b,Normalizer.COMPOSE,0); 310 // if (c.equals(a)) { 311 // errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + 312 // hex(b) + " x COMPOSE => " + 313 // hex(c)); 314 // } else if (isVerbose()) { 315 // logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + 316 // hex(b) + " x COMPOSE => " + 317 // hex(c)); 318 // } 319 // } 320 } 321 322 /** 323 * Test for a problem that showed up just before ICU 1.6 release 324 * having to do with combining characters with an index of zero. 325 * Such characters do not participate in any canonical 326 * decompositions. However, having an index of zero means that 327 * they all share one typeMask[] entry, that is, they all have to 328 * map to the same canonical class, which is not the case, in 329 * reality. 
330 */ 331 @Test TestZeroIndex()332 public void TestZeroIndex() 333 throws Exception{ 334 String[] DATA = { 335 // Expect col1 x COMPOSE_COMPAT => col2 336 // Expect col2 x DECOMP => col3 337 "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300", 338 "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300", 339 "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300", 340 "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327", 341 "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321", 342 }; 343 344 for (int i=0; i<DATA.length; i+=3) { 345 String a = DATA[i]; 346 String b = Normalizer.normalize(a, Normalizer.NFKC); 347 String exp = DATA[i+1]; 348 if (b.equals(exp)) { 349 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b)); 350 } else { 351 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) + 352 ", expect " + hex(exp)); 353 } 354 a = Normalizer.normalize(b, Normalizer.NFD); 355 exp = DATA[i+2]; 356 if (a.equals(exp)) { 357 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a)); 358 } else { 359 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) + 360 ", expect " + hex(exp)); 361 } 362 } 363 } 364 365 /** 366 * Test for a problem found by Verisign. Problem is that 367 * characters at the start of a string are not put in canonical 368 * order correctly by compose() if there is no starter. 369 */ 370 @Test TestVerisign()371 public void TestVerisign() 372 throws Exception{ 373 String[] inputs = { 374 "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f", 375 "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad" 376 }; 377 String[] outputs = { 378 "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f", 379 "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4" 380 }; 381 382 for (int i = 0; i < inputs.length; ++i) { 383 String input = inputs[i]; 384 String output = outputs[i]; 385 String result = Normalizer.decompose(input, false); 386 if (!result.equals(output)) { 387 errln("FAIL input: " + hex(input)); 388 errln(" decompose: " + hex(result)); 389 errln(" expected: " + hex(output)); 390 } 391 result = Normalizer.compose(input, false); 392 if (!result.equals(output)) { 393 errln("FAIL input: " + hex(input)); 394 errln(" compose: " + hex(result)); 395 errln(" expected: " + hex(output)); 396 } 397 } 398 399 } 400 @Test TestQuickCheckResultNO()401 public void TestQuickCheckResultNO() 402 throws Exception{ 403 final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C, 404 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E}; 405 final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB, 406 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E}; 407 final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE, 408 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; 409 final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE, 410 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; 411 412 413 final int SIZE = 10; 414 415 int count = 0; 416 for (; count < SIZE; count ++) 417 { 418 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), 419 Normalizer.NFD,0) != Normalizer.NO) 420 { 421 errln("ERROR in NFD quick check at U+" + 422 Integer.toHexString(CPNFD[count])); 423 return; 424 } 425 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 426 Normalizer.NFC,0) !=Normalizer.NO) 427 { 428 errln("ERROR in NFC quick check at U+"+ 429 Integer.toHexString(CPNFC[count])); 430 return; 431 } 432 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), 433 Normalizer.NFKD,0) != Normalizer.NO) 434 { 435 errln("ERROR in NFKD quick check at U+"+ 436 Integer.toHexString(CPNFKD[count])); 437 return; 438 } 439 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 440 
Normalizer.NFKC,0) !=Normalizer.NO) 441 { 442 errln("ERROR in NFKC quick check at U+"+ 443 Integer.toHexString(CPNFKC[count])); 444 return; 445 } 446 // for improving coverage 447 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 448 Normalizer.NFKC) !=Normalizer.NO) 449 { 450 errln("ERROR in NFKC quick check at U+"+ 451 Integer.toHexString(CPNFKC[count])); 452 return; 453 } 454 } 455 } 456 457 458 @Test TestQuickCheckResultYES()459 public void TestQuickCheckResultYES() 460 throws Exception{ 461 final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A, 462 0x2261, 0x3075, 0x4000, 0x5000, 0xF000}; 463 final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500, 464 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000}; 465 final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB, 466 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27}; 467 final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000, 468 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E}; 469 470 final int SIZE = 10; 471 int count = 0; 472 473 char cp = 0; 474 while (cp < 0xA0) 475 { 476 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0) 477 != Normalizer.YES) 478 { 479 errln("ERROR in NFD quick check at U+"+ 480 Integer.toHexString(cp)); 481 return; 482 } 483 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0) 484 != Normalizer.YES) 485 { 486 errln("ERROR in NFC quick check at U+"+ 487 Integer.toHexString(cp)); 488 return; 489 } 490 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0) 491 != Normalizer.YES) 492 { 493 errln("ERROR in NFKD quick check at U+" + 494 Integer.toHexString(cp)); 495 return; 496 } 497 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0) 498 != Normalizer.YES) 499 { 500 errln("ERROR in NFKC quick check at U+"+ 501 Integer.toHexString(cp)); 502 return; 503 } 504 // improve the coverage 505 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC) 506 != Normalizer.YES) 507 { 508 errln("ERROR in NFKC quick check at U+"+ 509 Integer.toHexString(cp)); 510 return; 511 } 512 cp++; 513 } 514 515 for (; count < SIZE; count ++) 516 { 517 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), 518 Normalizer.NFD,0)!=Normalizer.YES) 519 { 520 errln("ERROR in NFD quick check at U+"+ 521 Integer.toHexString(CPNFD[count])); 522 return; 523 } 524 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 525 Normalizer.NFC,0)!=Normalizer.YES) 526 { 527 errln("ERROR in NFC quick check at U+"+ 528 Integer.toHexString(CPNFC[count])); 529 return; 530 } 531 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), 532 Normalizer.NFKD,0)!=Normalizer.YES) 533 { 534 errln("ERROR in NFKD quick check at U+"+ 535 Integer.toHexString(CPNFKD[count])); 536 return; 537 } 538 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 539 Normalizer.NFKC,0)!=Normalizer.YES) 540 { 541 errln("ERROR in NFKC quick check at U+"+ 542 Integer.toHexString(CPNFKC[count])); 543 return; 544 } 545 // improve the coverage 546 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 547 Normalizer.NFKC)!=Normalizer.YES) 548 { 549 errln("ERROR in NFKC quick check at U+"+ 550 Integer.toHexString(CPNFKC[count])); 551 return; 552 } 553 } 554 } 555 @Test TestBengali()556 public void TestBengali() throws Exception{ 557 String input = "\u09bc\u09be\u09cd\u09be"; 558 String output=Normalizer.normalize(input,Normalizer.NFC); 559 if(!input.equals(output)){ 560 errln("ERROR in NFC of string"); 561 } 562 } 563 @Test TestQuickCheckResultMAYBE()564 public void TestQuickCheckResultMAYBE() 565 throws Exception{ 566 567 final char[] 
CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161, 568 0x116A, 0x1173, 0x1175, 0x3099, 0x309A}; 569 final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E, 570 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099}; 571 572 573 final int SIZE = 10; 574 575 int count = 0; 576 577 /* NFD and NFKD does not have any MAYBE codepoints */ 578 for (; count < SIZE; count ++) 579 { 580 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 581 Normalizer.NFC,0)!=Normalizer.MAYBE) 582 { 583 errln("ERROR in NFC quick check at U+"+ 584 Integer.toHexString(CPNFC[count])); 585 return; 586 } 587 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 588 Normalizer.NFKC,0)!=Normalizer.MAYBE) 589 { 590 errln("ERROR in NFKC quick check at U+"+ 591 Integer.toHexString(CPNFKC[count])); 592 return; 593 } 594 if (Normalizer.quickCheck(new char[]{CPNFC[count]}, 595 Normalizer.NFC,0)!=Normalizer.MAYBE) 596 { 597 errln("ERROR in NFC quick check at U+"+ 598 Integer.toHexString(CPNFC[count])); 599 return; 600 } 601 if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, 602 Normalizer.NFKC,0)!=Normalizer.MAYBE) 603 { 604 errln("ERROR in NFKC quick check at U+"+ 605 Integer.toHexString(CPNFKC[count])); 606 return; 607 } 608 if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, 609 Normalizer.NONE,0)!=Normalizer.YES) 610 { 611 errln("ERROR in NONE quick check at U+"+ 612 Integer.toHexString(CPNFKC[count])); 613 return; 614 } 615 } 616 } 617 618 @Test TestQuickCheckStringResult()619 public void TestQuickCheckStringResult() 620 throws Exception{ 621 int count; 622 String d; 623 String c; 624 625 for (count = 0; count < canonTests.length; count ++) 626 { 627 d = canonTests[count][1]; 628 c = canonTests[count][2]; 629 if (Normalizer.quickCheck(d,Normalizer.NFD,0) 630 != Normalizer.YES) 631 { 632 errln("ERROR in NFD quick check for string at count " + count); 633 return; 634 } 635 636 if (Normalizer.quickCheck(c, Normalizer.NFC,0) 637 == Normalizer.NO) 638 { 639 errln("ERROR in NFC quick check for string at count " + count); 640 return; 641 } 642 } 643 644 for (count = 0; count < compatTests.length; count ++) 645 { 646 d = compatTests[count][1]; 647 c = compatTests[count][2]; 648 if (Normalizer.quickCheck(d, Normalizer.NFKD,0) 649 != Normalizer.YES) 650 { 651 errln("ERROR in NFKD quick check for string at count " + count); 652 return; 653 } 654 655 if (Normalizer.quickCheck(c, Normalizer.NFKC,0) 656 != Normalizer.YES) 657 { 658 errln("ERROR in NFKC quick check for string at count " + count); 659 return; 660 } 661 } 662 } 663 qcToInt(Normalizer.QuickCheckResult qc)664 static final int qcToInt(Normalizer.QuickCheckResult qc) { 665 if(qc==Normalizer.NO) { 666 return 0; 667 } else if(qc==Normalizer.YES) { 668 return 1; 669 } else /* Normalizer.MAYBE */ { 670 return 2; 671 } 672 } 673 674 @Test TestQuickCheckPerCP()675 public void TestQuickCheckPerCP() { 676 int c, lead, trail; 677 String s, nfd; 678 int lccc1, lccc2, tccc1, tccc2; 679 int qc1, qc2; 680 681 if( 682 UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES 683 UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 || 684 UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE 685 UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 || 686 UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) || 687 
UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) 688 ) { 689 errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS"); 690 } 691 692 /* 693 * compare the quick check property values for some code points 694 * to the quick check results for checking same-code point strings 695 */ 696 c=0; 697 while(c<0x110000) { 698 s=UTF16.valueOf(c); 699 700 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK); 701 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC)); 702 if(qc1!=qc2) { 703 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c)); 704 } 705 706 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK); 707 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD)); 708 if(qc1!=qc2) { 709 errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c)); 710 } 711 712 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK); 713 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC)); 714 if(qc1!=qc2) { 715 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c)); 716 } 717 718 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK); 719 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD)); 720 if(qc1!=qc2) { 721 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c)); 722 } 723 724 nfd=Normalizer.normalize(s, Normalizer.NFD); 725 lead=UTF16.charAt(nfd, 0); 726 trail=UTF16.charAt(nfd, nfd.length()-1); 727 728 lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS); 729 lccc2=UCharacter.getCombiningClass(lead); 730 tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); 731 tccc2=UCharacter.getCombiningClass(trail); 732 733 if(lccc1!=lccc2) { 734 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c)); 735 } 736 if(tccc1!=tccc2) { 737 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c)); 738 } 739 740 /* skip some code points */ 741 c=(20*c)/19+1; 742 } 743 } 744 745 //------------------------------------------------------------------------ 746 // Internal utilities 747 // 748 //------------------------------------------------------------------------ 749 // Internal utilities 750 // 751 752 /* private void backAndForth(Normalizer iter, String input) 753 { 754 iter.setText(input); 755 756 // Run through the iterator forwards and stick it into a StringBuffer 757 StringBuffer forward = new StringBuffer(); 758 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) { 759 forward.append(ch); 760 } 761 762 // Now do it backwards 763 StringBuffer reverse = new StringBuffer(); 764 for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) { 765 reverse.insert(0, ch); 766 } 767 768 if (!forward.toString().equals(reverse.toString())) { 769 errln("FAIL: Forward/reverse mismatch for input " + hex(input) 770 + ", forward: " + hex(forward) + ", backward: "+hex(reverse)); 771 } else if (isVerbose()) { 772 logln("Ok: Forward/reverse for input " + hex(input) 773 + ", forward: " + hex(forward) + ", backward: "+hex(reverse)); 774 } 775 }*/ 776 backAndForth(Normalizer iter, String[][] tests)777 private void backAndForth(Normalizer iter, String[][] tests) 778 { 
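        // For each test string, run the Normalizer forward with next() and
        // backward with previous(); both directions must produce the same
        // normalized text.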
779 for (int i = 0; i < tests.length; i++) 780 { 781 iter.setText(tests[i][0]); 782 783 // Run through the iterator forwards and stick it into a 784 // StringBuffer 785 StringBuffer forward = new StringBuffer(); 786 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) { 787 forward.append(ch); 788 } 789 790 // Now do it backwards 791 StringBuffer reverse = new StringBuffer(); 792 for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) { 793 reverse.insert(0, ch); 794 } 795 796 if (!forward.toString().equals(reverse.toString())) { 797 errln("FAIL: Forward/reverse mismatch for input " 798 + hex(tests[i][0]) + ", forward: " + hex(forward) 799 + ", backward: " + hex(reverse)); 800 } else if (isVerbose()) { 801 logln("Ok: Forward/reverse for input " + hex(tests[i][0]) 802 + ", forward: " + hex(forward) + ", backward: " 803 + hex(reverse)); 804 } 805 } 806 } 807 staticTest(Normalizer.Mode mode, String[][] tests, int outCol)808 private void staticTest (Normalizer.Mode mode, 809 String[][] tests, int outCol) throws Exception{ 810 for (int i = 0; i < tests.length; i++) 811 { 812 String input = Utility.unescape(tests[i][0]); 813 String expect = Utility.unescape(tests[i][outCol]); 814 815 logln("Normalizing '" + input + "' (" + hex(input) + ")" ); 816 817 String output = Normalizer.normalize(input, mode); 818 819 if (!output.equals(expect)) { 820 errln("FAIL: case " + i 821 + " expected '" + expect + "' (" + hex(expect) + ")" 822 + " but got '" + output + "' (" + hex(output) + ")" ); 823 } 824 } 825 char[] output = new char[1]; 826 for (int i = 0; i < tests.length; i++) 827 { 828 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 829 String expect =Utility.unescape( tests[i][outCol]); 830 831 logln("Normalizing '" + new String(input) + "' (" + 832 hex(new String(input)) + ")" ); 833 int reqLength=0; 834 while(true){ 835 try{ 836 reqLength=Normalizer.normalize(input,output, mode,0); 837 if(reqLength<=output.length ){ 838 break; 839 } 840 }catch(IndexOutOfBoundsException e){ 841 output= new char[Integer.parseInt(e.getMessage())]; 842 continue; 843 } 844 } 845 if (!expect.equals(new String(output,0,reqLength))) { 846 errln("FAIL: case " + i 847 + " expected '" + expect + "' (" + hex(expect) + ")" 848 + " but got '" + new String(output) 849 + "' (" + hex(new String(output)) + ")" ); 850 } 851 } 852 } decomposeTest(Normalizer.Mode mode, String[][] tests, int outCol)853 private void decomposeTest(Normalizer.Mode mode, 854 String[][] tests, int outCol) throws Exception{ 855 for (int i = 0; i < tests.length; i++) 856 { 857 String input = Utility.unescape(tests[i][0]); 858 String expect = Utility.unescape(tests[i][outCol]); 859 860 logln("Normalizing '" + input + "' (" + hex(input) + ")" ); 861 862 String output = Normalizer.decompose(input, mode==Normalizer.NFKD); 863 864 if (!output.equals(expect)) { 865 errln("FAIL: case " + i 866 + " expected '" + expect + "' (" + hex(expect) + ")" 867 + " but got '" + output + "' (" + hex(output) + ")" ); 868 } 869 } 870 char[] output = new char[1]; 871 for (int i = 0; i < tests.length; i++) 872 { 873 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 874 String expect = Utility.unescape(tests[i][outCol]); 875 876 logln("Normalizing '" + new String(input) + "' (" + 877 hex(new String(input)) + ")" ); 878 int reqLength=0; 879 while(true){ 880 try{ 881 reqLength=Normalizer.decompose(input,output, mode==Normalizer.NFKD,0); 882 if(reqLength<=output.length ){ 883 break; 884 } 885 }catch(IndexOutOfBoundsException e){ 886 
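                    // An undersized destination is reported via an
                    // IndexOutOfBoundsException whose message carries the
                    // required length; grow the buffer to that length and retry.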
output= new char[Integer.parseInt(e.getMessage())]; 887 continue; 888 } 889 } 890 if (!expect.equals(new String(output,0,reqLength))) { 891 errln("FAIL: case " + i 892 + " expected '" + expect + "' (" + hex(expect) + ")" 893 + " but got '" + new String(output) 894 + "' (" + hex(new String(output)) + ")" ); 895 } 896 } 897 output = new char[1]; 898 for (int i = 0; i < tests.length; i++) 899 { 900 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 901 String expect = Utility.unescape(tests[i][outCol]); 902 903 logln("Normalizing '" + new String(input) + "' (" + 904 hex(new String(input)) + ")" ); 905 int reqLength=0; 906 while(true){ 907 try{ 908 reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, mode==Normalizer.NFKD,0); 909 if(reqLength<=output.length ){ 910 break; 911 } 912 }catch(IndexOutOfBoundsException e){ 913 output= new char[Integer.parseInt(e.getMessage())]; 914 continue; 915 } 916 } 917 if (!expect.equals(new String(output,0,reqLength))) { 918 errln("FAIL: case " + i 919 + " expected '" + expect + "' (" + hex(expect) + ")" 920 + " but got '" + new String(output) 921 + "' (" + hex(new String(output)) + ")" ); 922 } 923 char[] output2 = new char[reqLength * 2]; 924 System.arraycopy(output, 0, output2, 0, reqLength); 925 int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0); 926 if(retLength != reqLength){ 927 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength); 928 } 929 } 930 } 931 composeTest(Normalizer.Mode mode, String[][] tests, int outCol)932 private void composeTest(Normalizer.Mode mode, 933 String[][] tests, int outCol) throws Exception{ 934 for (int i = 0; i < tests.length; i++) 935 { 936 String input = Utility.unescape(tests[i][0]); 937 String expect = Utility.unescape(tests[i][outCol]); 938 939 logln("Normalizing '" + input + "' (" + hex(input) + ")" ); 940 941 String output = Normalizer.compose(input, mode==Normalizer.NFKC); 942 943 if (!output.equals(expect)) { 944 errln("FAIL: case " + i 945 + " expected '" + expect + "' (" + hex(expect) + ")" 946 + " but got '" + output + "' (" + hex(output) + ")" ); 947 } 948 } 949 char[] output = new char[1]; 950 for (int i = 0; i < tests.length; i++) 951 { 952 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 953 String expect = Utility.unescape(tests[i][outCol]); 954 955 logln("Normalizing '" + new String(input) + "' (" + 956 hex(new String(input)) + ")" ); 957 int reqLength=0; 958 while(true){ 959 try{ 960 reqLength=Normalizer.compose(input,output, mode==Normalizer.NFKC,0); 961 if(reqLength<=output.length ){ 962 break; 963 } 964 }catch(IndexOutOfBoundsException e){ 965 output= new char[Integer.parseInt(e.getMessage())]; 966 continue; 967 } 968 } 969 if (!expect.equals(new String(output,0,reqLength))) { 970 errln("FAIL: case " + i 971 + " expected '" + expect + "' (" + hex(expect) + ")" 972 + " but got '" + new String(output) 973 + "' (" + hex(new String(output)) + ")" ); 974 } 975 } 976 output = new char[1]; 977 for (int i = 0; i < tests.length; i++) 978 { 979 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 980 String expect = Utility.unescape(tests[i][outCol]); 981 982 logln("Normalizing '" + new String(input) + "' (" + 983 hex(new String(input)) + ")" ); 984 int reqLength=0; 985 while(true){ 986 try{ 987 reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, mode==Normalizer.NFKC,0); 988 if(reqLength<=output.length ){ 989 
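                        // The composed result fit into the destination buffer;
                        // reqLength chars of output are valid.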
break; 990 } 991 }catch(IndexOutOfBoundsException e){ 992 output= new char[Integer.parseInt(e.getMessage())]; 993 continue; 994 } 995 } 996 if (!expect.equals(new String(output,0,reqLength))) { 997 errln("FAIL: case " + i 998 + " expected '" + expect + "' (" + hex(expect) + ")" 999 + " but got '" + new String(output) 1000 + "' (" + hex(new String(output)) + ")" ); 1001 } 1002 1003 char[] output2 = new char[reqLength * 2]; 1004 System.arraycopy(output, 0, output2, 0, reqLength); 1005 int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0); 1006 if(retLength != reqLength){ 1007 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength); 1008 } 1009 } 1010 } iterateTest(Normalizer iter, String[][] tests, int outCol)1011 private void iterateTest(Normalizer iter, String[][] tests, int outCol){ 1012 for (int i = 0; i < tests.length; i++) 1013 { 1014 String input = Utility.unescape(tests[i][0]); 1015 String expect = Utility.unescape(tests[i][outCol]); 1016 1017 logln("Normalizing '" + input + "' (" + hex(input) + ")" ); 1018 1019 iter.setText(input); 1020 assertEqual(expect, iter, "case " + i + " "); 1021 } 1022 } 1023 assertEqual(String expected, Normalizer iter, String msg)1024 private void assertEqual(String expected, Normalizer iter, String msg) 1025 { 1026 int index = 0; 1027 int ch; 1028 UCharacterIterator cIter = UCharacterIterator.getInstance(expected); 1029 1030 while ((ch=iter.next())!= Normalizer.DONE){ 1031 if (index >= expected.length()) { 1032 errln("FAIL: " + msg + "Unexpected character '" + (char)ch 1033 + "' (" + hex(ch) + ")" 1034 + " at index " + index); 1035 break; 1036 } 1037 int want = UTF16.charAt(expected,index); 1038 if (ch != want) { 1039 errln("FAIL: " + msg + "got '" + (char)ch 1040 + "' (" + hex(ch) + ")" 1041 + " but expected '" + want + "' (" + hex(want)+ ")" 1042 + " at index " + index); 1043 } 1044 index+= UTF16.getCharCount(ch); 1045 } 1046 if (index < expected.length()) { 1047 errln("FAIL: " + msg + "Only got " + index + " chars, expected " 1048 + expected.length()); 1049 } 1050 1051 cIter.setToLimit(); 1052 while((ch=iter.previous())!=Normalizer.DONE){ 1053 int want = cIter.previousCodePoint(); 1054 if (ch != want ) { 1055 errln("FAIL: " + msg + "got '" + (char)ch 1056 + "' (" + hex(ch) + ")" 1057 + " but expected '" + want + "' (" + hex(want) + ")" 1058 + " at index " + index); 1059 } 1060 } 1061 } 1062 //-------------------------------------------------------------------------- 1063 1064 // NOTE: These tests are used for quick debugging so are not ported 1065 // to ICU4C tsnorm.cpp in intltest 1066 // 1067 1068 @Test TestDebugStatic()1069 public void TestDebugStatic(){ 1070 String in = Utility.unescape("\\U0001D157\\U0001D165"); 1071 if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){ 1072 errln("isNormalized failed"); 1073 } 1074 1075 String input = "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1076 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1077 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1078 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1079 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1080 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1081 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1082 
"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1083 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1084 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1085 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1086 "d\u031B\u0307\u0323"; 1087 String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+ 1088 "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+ 1089 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1090 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1091 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1092 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1093 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1094 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1095 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1096 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1097 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1098 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1099 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1100 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1101 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1102 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1103 "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+ 1104 "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1105 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1106 "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+ 1107 "cccccccccccccccccccccccccccccccccccccccccccccccc"+ 1108 "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1109 "dddddddddddddddddddddddd"+ 1110 "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+ 1111 "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307"; 1112 String output = Normalizer.normalize(Utility.unescape(input), 1113 Normalizer.NFD); 1114 if(!expect.equals(output)){ 1115 errln("FAIL expected: "+hex(expect) + " got: "+hex(output)); 1116 } 1117 1118 1119 1120 } 1121 @Test TestDebugIter()1122 public void TestDebugIter(){ 1123 String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); 1124 String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); 1125 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)), 1126 Normalizer.NONE,0); 1127 int index = 0; 1128 int ch; 1129 UCharacterIterator cIter = UCharacterIterator.getInstance(expected); 1130 1131 while ((ch=iter.next())!= Normalizer.DONE){ 1132 if (index >= expected.length()) { 1133 errln("FAIL: " + "Unexpected character '" + (char)ch 1134 + "' (" + hex(ch) + ")" 1135 + " at index " + index); 1136 break; 1137 } 1138 int want = UTF16.charAt(expected,index); 1139 if (ch != want) { 1140 errln("FAIL: " + "got '" + (char)ch 1141 + "' (" + hex(ch) + ")" 1142 + " but expected '" + want + "' (" + hex(want)+ ")" 1143 + " at index " + index); 1144 } 1145 index+= UTF16.getCharCount(ch); 1146 } 1147 if (index < expected.length()) { 1148 errln("FAIL: " + "Only got " + index + " chars, expected " 1149 + expected.length()); 1150 } 1151 1152 cIter.setToLimit(); 1153 while((ch=iter.previous())!=Normalizer.DONE){ 1154 int want = cIter.previousCodePoint(); 1155 if (ch != want ) { 1156 errln("FAIL: " + "got '" + (char)ch 1157 + "' (" + hex(ch) + ")" 1158 + " but expected '" + want + "' (" + hex(want) + ")" 1159 + " at index " + index); 1160 } 1161 } 1162 } 1163 @Test TestDebugIterOld()1164 public void TestDebugIterOld(){ 1165 String input = "\\U0001D15E"; 1166 String expected = "\uD834\uDD57\uD834\uDD65"; 1167 
String expectedReverse = "\uD834\uDD65\uD834\uDD57"; 1168 int index = 0; 1169 int ch; 1170 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)), 1171 Normalizer.NFKC,0); 1172 StringBuffer got = new StringBuffer(); 1173 for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next()) 1174 { 1175 if (index >= expected.length()) { 1176 errln("FAIL: " + "Unexpected character '" + (char)ch + 1177 "' (" + hex(ch) + ")" + " at index " + index); 1178 break; 1179 } 1180 got.append(UCharacter.toString(ch)); 1181 index++; 1182 } 1183 if (!expected.equals(got.toString())) { 1184 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" 1185 + " but expected '" + expected + "' (" 1186 + hex(expected) + ")"); 1187 } 1188 if (got.length() < expected.length()) { 1189 errln("FAIL: " + "Only got " + index + " chars, expected " 1190 + expected.length()); 1191 } 1192 1193 logln("Reverse Iteration\n"); 1194 iter.setIndexOnly(iter.endIndex()); 1195 got.setLength(0); 1196 for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){ 1197 if (index >= expected.length()) { 1198 errln("FAIL: " + "Unexpected character '" + (char)ch 1199 + "' (" + hex(ch) + ")" + " at index " + index); 1200 break; 1201 } 1202 got.append(UCharacter.toString(ch)); 1203 } 1204 if (!expectedReverse.equals(got.toString())) { 1205 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" 1206 + " but expected '" + expected 1207 + "' (" + hex(expected) + ")"); 1208 } 1209 if (got.length() < expected.length()) { 1210 errln("FAIL: " + "Only got " + index + " chars, expected " 1211 + expected.length()); 1212 } 1213 1214 } 1215 //-------------------------------------------------------------------------- 1216 // helper class for TestPreviousNext() 1217 // simple UTF-32 character iterator 1218 class UCharIterator { 1219 UCharIterator(int[] src, int len, int index)1220 public UCharIterator(int[] src, int len, int index){ 1221 1222 s=src; 1223 length=len; 1224 i=index; 1225 } 1226 current()1227 public int current() { 1228 if(i<length) { 1229 return s[i]; 1230 } else { 1231 return -1; 1232 } 1233 } 1234 next()1235 public int next() { 1236 if(i<length) { 1237 return s[i++]; 1238 } else { 1239 return -1; 1240 } 1241 } 1242 previous()1243 public int previous() { 1244 if(i>0) { 1245 return s[--i]; 1246 } else { 1247 return -1; 1248 } 1249 } 1250 getIndex()1251 public int getIndex() { 1252 return i; 1253 } 1254 1255 private int[] s; 1256 private int length, i; 1257 } 1258 @Test TestPreviousNext()1259 public void TestPreviousNext() { 1260 // src and expect strings 1261 char src[]={ 1262 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), 1263 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), 1264 0xc4, 1265 0x1ed0 1266 }; 1267 int expect[]={ 1268 0x831d, 1269 0x1d158, 0x1d165, 1270 0x41, 0x308, 1271 0x4f, 0x302, 0x301 1272 }; 1273 1274 // expected src indexes corresponding to expect indexes 1275 int expectIndex[]={ 1276 0, 1277 2, 2, 1278 4, 4, 1279 5, 5, 5, 1280 6 // behind last character 1281 }; 1282 1283 // initial indexes into the src and expect strings 1284 1285 final int SRC_MIDDLE=4; 1286 final int EXPECT_MIDDLE=3; 1287 1288 1289 // movement vector 1290 // - for previous(), 0 for current(), + for next() 1291 // not const so that we can terminate it below for the error message 1292 String moves="0+0+0--0-0-+++0--+++++++0--------"; 1293 1294 // iterators 1295 Normalizer iter = new Normalizer(new String(src), 1296 Normalizer.NFD,0); 1297 UCharIterator iter32 = new UCharIterator(expect, 
expect.length, 1298 EXPECT_MIDDLE); 1299 1300 int c1, c2; 1301 char m; 1302 1303 // initially set the indexes into the middle of the strings 1304 iter.setIndexOnly(SRC_MIDDLE); 1305 1306 // move around and compare the iteration code points with 1307 // the expected ones 1308 int movesIndex =0; 1309 while(movesIndex<moves.length()) { 1310 m=moves.charAt(movesIndex++); 1311 if(m=='-') { 1312 c1=iter.previous(); 1313 c2=iter32.previous(); 1314 } else if(m=='0') { 1315 c1=iter.current(); 1316 c2=iter32.current(); 1317 } else /* m=='+' */ { 1318 c1=iter.next(); 1319 c2=iter32.next(); 1320 } 1321 1322 // compare results 1323 if(c1!=c2) { 1324 // copy the moves until the current (m) move, and terminate 1325 String history = moves.substring(0,movesIndex); 1326 errln("error: mismatch in Normalizer iteration at "+history+": " 1327 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); 1328 break; 1329 } 1330 1331 // compare indexes 1332 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { 1333 // copy the moves until the current (m) move, and terminate 1334 String history = moves.substring(0,movesIndex); 1335 errln("error: index mismatch in Normalizer iteration at " 1336 +history+ " : "+ "Normalizer index " +iter.getIndex() 1337 +" expected "+ expectIndex[iter32.getIndex()]); 1338 break; 1339 } 1340 } 1341 } 1342 // Only in ICU4j 1343 @Test TestPreviousNextJCI()1344 public void TestPreviousNextJCI() { 1345 // src and expect strings 1346 char src[]={ 1347 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), 1348 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), 1349 0xc4, 1350 0x1ed0 1351 }; 1352 int expect[]={ 1353 0x831d, 1354 0x1d158, 0x1d165, 1355 0x41, 0x308, 1356 0x4f, 0x302, 0x301 1357 }; 1358 1359 // expected src indexes corresponding to expect indexes 1360 int expectIndex[]={ 1361 0, 1362 2, 2, 1363 4, 4, 1364 5, 5, 5, 1365 6 // behind last character 1366 }; 1367 1368 // initial indexes into the src and expect strings 1369 1370 final int SRC_MIDDLE=4; 1371 final int EXPECT_MIDDLE=3; 1372 1373 1374 // movement vector 1375 // - for previous(), 0 for current(), + for next() 1376 // not const so that we can terminate it below for the error message 1377 String moves="0+0+0--0-0-+++0--+++++++0--------"; 1378 1379 // iterators 1380 StringCharacterIterator text = new StringCharacterIterator(new String(src)); 1381 Normalizer iter = new Normalizer(text,Normalizer.NFD,0); 1382 UCharIterator iter32 = new UCharIterator(expect, expect.length, 1383 EXPECT_MIDDLE); 1384 1385 int c1, c2; 1386 char m; 1387 1388 // initially set the indexes into the middle of the strings 1389 iter.setIndexOnly(SRC_MIDDLE); 1390 1391 // move around and compare the iteration code points with 1392 // the expected ones 1393 int movesIndex =0; 1394 while(movesIndex<moves.length()) { 1395 m=moves.charAt(movesIndex++); 1396 if(m=='-') { 1397 c1=iter.previous(); 1398 c2=iter32.previous(); 1399 } else if(m=='0') { 1400 c1=iter.current(); 1401 c2=iter32.current(); 1402 } else /* m=='+' */ { 1403 c1=iter.next(); 1404 c2=iter32.next(); 1405 } 1406 1407 // compare results 1408 if(c1!=c2) { 1409 // copy the moves until the current (m) move, and terminate 1410 String history = moves.substring(0,movesIndex); 1411 errln("error: mismatch in Normalizer iteration at "+history+": " 1412 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); 1413 break; 1414 } 1415 1416 // compare indexes 1417 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { 1418 // copy the moves until the current (m) move, and terminate 1419 
String history = moves.substring(0,movesIndex); 1420 errln("error: index mismatch in Normalizer iteration at " 1421 +history+ " : "+ "Normalizer index " +iter.getIndex() 1422 +" expected "+ expectIndex[iter32.getIndex()]); 1423 break; 1424 } 1425 } 1426 } 1427 1428 // test APIs that are not otherwise used - improve test coverage 1429 @Test TestNormalizerAPI()1430 public void TestNormalizerAPI() throws Exception { 1431 try{ 1432 // instantiate a Normalizer from a CharacterIterator 1433 String s=Utility.unescape("a\u0308\uac00\\U0002f800"); 1434 // make s a bit longer and more interesting 1435 UCharacterIterator iter = UCharacterIterator.getInstance(s+s); 1436 Normalizer norm = new Normalizer(iter, Normalizer.NFC,0); 1437 if(norm.next()!=0xe4) { 1438 errln("error in Normalizer(CharacterIterator).next()"); 1439 } 1440 1441 // test clone(), ==, and hashCode() 1442 Normalizer clone=(Normalizer)norm.clone(); 1443 if(clone.equals(norm)) { 1444 errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm"); 1445 } 1446 1447 if(clone.getLength()!= norm.getLength()){ 1448 errln("error in Normalizer.getBeginIndex()"); 1449 } 1450 // clone must have the same hashCode() 1451 //if(clone.hashCode()!=norm.hashCode()) { 1452 // errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()"); 1453 //} 1454 if(clone.next()!=0xac00) { 1455 errln("error in Normalizer(Normalizer(CharacterIterator)).next()"); 1456 } 1457 int ch = clone.next(); 1458 if(ch!=0x4e3d) { 1459 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()"); 1460 } 1461 // position changed, must change hashCode() 1462 if(clone.hashCode()==norm.hashCode()) { 1463 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()"); 1464 } 1465 1466 // test compose() and decompose() 1467 StringBuffer tel; 1468 String nfkc, nfkd; 1469 tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121"); 1470 tel.insert(1,(char)0x0301); 1471 1472 nfkc=Normalizer.compose(tel.toString(), true); 1473 nfkd=Normalizer.decompose(tel.toString(), true); 1474 if( 1475 !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))|| 1476 !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL")) 1477 ) { 1478 errln("error in Normalizer::(de)compose(): wrong result(s)"); 1479 } 1480 1481 // test setIndex() 1482 ch=norm.setIndex(3); 1483 if(ch!=0x4e3d) { 1484 errln("error in Normalizer(CharacterIterator).setIndex(3)"); 1485 } 1486 1487 // test setText(CharacterIterator) and getText() 1488 String out, out2; 1489 clone.setText(iter); 1490 1491 out = clone.getText(); 1492 out2 = iter.getText(); 1493 if( !out.equals(out2) || 1494 clone.startIndex()!=0|| 1495 clone.endIndex()!=iter.getLength() 1496 ) { 1497 errln("error in Normalizer::setText() or Normalizer::getText()"); 1498 } 1499 1500 char[] fillIn1 = new char[clone.getLength()]; 1501 char[] fillIn2 = new char[iter.getLength()]; 1502 int len = clone.getText(fillIn1); 1503 iter.getText(fillIn2,0); 1504 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ 1505 errln("error in Normalizer.getText(). 
Normalizer: "+ 1506 Utility.hex(new String(fillIn1))+ 1507 " Iter: " + Utility.hex(new String(fillIn2))); 1508 } 1509 1510 clone.setText(fillIn1); 1511 len = clone.getText(fillIn2); 1512 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ 1513 errln("error in Normalizer.setText() or Normalizer.getText()"+ 1514 Utility.hex(new String(fillIn1))+ 1515 " Iter: " + Utility.hex(new String(fillIn2))); 1516 } 1517 1518 // test setText(UChar *), getUMode() and setMode() 1519 clone.setText(s); 1520 clone.setIndexOnly(1); 1521 clone.setMode(Normalizer.NFD); 1522 if(clone.getMode()!=Normalizer.NFD) { 1523 errln("error in Normalizer::setMode() or Normalizer::getMode()"); 1524 } 1525 if(clone.next()!=0x308 || clone.next()!=0x1100) { 1526 errln("error in Normalizer::setText() or Normalizer::setMode()"); 1527 } 1528 1529 // test last()/previous() with an internal buffer overflow 1530 StringBuffer buf = new StringBuffer("aaaaaaaaaa"); 1531 buf.setCharAt(10-1,'\u0308'); 1532 clone.setText(buf); 1533 if(clone.last()!=0x308) { 1534 errln("error in Normalizer(10*U+0308).last()"); 1535 } 1536 1537 // test UNORM_NONE 1538 norm.setMode(Normalizer.NONE); 1539 if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) { 1540 errln("error in Normalizer(UNORM_NONE).first()/next()/last()"); 1541 } 1542 out=Normalizer.normalize(s, Normalizer.NONE); 1543 if(!out.equals(s)) { 1544 errln("error in Normalizer::normalize(UNORM_NONE)"); 1545 } 1546 ch = 0x1D15E; 1547 String exp = "\\U0001D157\\U0001D165"; 1548 String ns = Normalizer.normalize(ch,Normalizer.NFC); 1549 if(!ns.equals(Utility.unescape(exp))){ 1550 errln("error in Normalizer.normalize(int,Mode)"); 1551 } 1552 ns = Normalizer.normalize(ch,Normalizer.NFC,0); 1553 if(!ns.equals(Utility.unescape(exp))){ 1554 errln("error in Normalizer.normalize(int,Mode,int)"); 1555 } 1556 }catch(Exception e){ 1557 throw e; 1558 } 1559 } 1560 1561 @Test TestConcatenate()1562 public void TestConcatenate() { 1563 1564 Object[][]cases=new Object[][]{ 1565 /* mode, left, right, result */ 1566 { 1567 Normalizer.NFC, 1568 "re", 1569 "\u0301sum\u00e9", 1570 "r\u00e9sum\u00e9" 1571 }, 1572 { 1573 Normalizer.NFC, 1574 "a\u1100", 1575 "\u1161bcdefghijk", 1576 "a\uac00bcdefghijk" 1577 }, 1578 /* ### TODO: add more interesting cases */ 1579 { 1580 Normalizer.NFD, 1581 "\u03B1\u0345", 1582 "\u0C4D\uD804\uDCBA\uD834\uDD69", // 0C4D 110BA 1D169 1583 "\u03B1\uD834\uDD69\uD804\uDCBA\u0C4D\u0345" // 03B1 1D169 110BA 0C4D 0345 1584 } 1585 }; 1586 1587 String left, right, expect, result; 1588 Normalizer.Mode mode; 1589 int i; 1590 1591 /* test concatenation */ 1592 for(i=0; i<cases.length; ++i) { 1593 mode = (Normalizer.Mode)cases[i][0]; 1594 1595 left=(String)cases[i][1]; 1596 right=(String)cases[i][2]; 1597 expect=(String)cases[i][3]; 1598 { 1599 result=Normalizer.concatenate(left, right, mode,0); 1600 if(!result.equals(expect)) { 1601 errln("error in Normalizer.concatenate(), cases[] failed" 1602 +", result==expect: expected: " 1603 + hex(expect)+" =========> got: " + hex(result)); 1604 } 1605 } 1606 { 1607 result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0); 1608 if(!result.equals(expect)) { 1609 errln("error in Normalizer.concatenate(), cases[] failed" 1610 +", result==expect: expected: " 1611 + hex(expect)+" =========> got: " + hex(result)); 1612 } 1613 } 1614 } 1615 1616 mode= Normalizer.NFC; // (Normalizer.Mode)cases2[0][0]; 1617 char[] destination = "My resume is here".toCharArray(); 1618 left = "resume"; 1619 right = "re\u0301sum\u00e9 is HERE"; 1620 
expect = "My r\u00e9sum\u00e9 is HERE"; 1621 1622 // Concatenates 're' with '\u0301sum\u00e9 is HERE' and places the result at 1623 // position 3 of string 'My resume is here'. 1624 Normalizer.concatenate(left.toCharArray(), 0, 2, right.toCharArray(), 2, 15, 1625 destination, 3, 17, mode, 0); 1626 if(!String.valueOf(destination).equals(expect)) { 1627 errln("error in Normalizer.concatenate(), cases2[] failed" 1628 +", result==expect: expected: " 1629 + hex(expect) + " =========> got: " + hex(destination)); 1630 } 1631 1632 // Error case when result of concatenation won't fit into destination array. 1633 try { 1634 Normalizer.concatenate(left.toCharArray(), 0, 2, right.toCharArray(), 2, 15, 1635 destination, 3, 16, mode, 0); 1636 } catch (IndexOutOfBoundsException e) { 1637 assertTrue("Normalizer.concatenate() failed", e.getMessage().equals("14")); 1638 return; 1639 } 1640 fail("Normalizer.concatenate() tested for failure but passed"); 1641 } 1642 1643 private final int RAND_MAX = 0x7fff; 1644 1645 @Test TestCheckFCD()1646 public void TestCheckFCD() 1647 { 1648 char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 1649 0x0008, 0x0009, 0x000A}; 1650 1651 char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301, 1652 0x02B9, 0x0314, 0x0315, 0x0316}; 1653 1654 char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7, 1655 0x0050, 0x0730, 0x09EE, 0x1E10}; 1656 1657 char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0}, 1658 {0x0061, 0x030A, 0x00E2, 0x0323, 0}, 1659 {0x0061, 0x0323, 0x00E2, 0x0323, 0}, 1660 {0x0061, 0x0323, 0x1E05, 0x0302, 0} 1661 }; 1662 Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES}; 1663 1664 char[] datachar= { 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 1665 0x6a, 1666 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 1667 0xea, 1668 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 1669 0x0307, 0x0308, 0x0309, 0x030a, 1670 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 1671 0x0327, 0x0328, 0x0329, 0x032a, 1672 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06, 1673 0x1e07, 0x1e08, 0x1e09, 0x1e0a 1674 }; 1675 1676 int count = 0; 1677 1678 if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES) 1679 errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n"); 1680 if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO) 1681 errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n"); 1682 if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES) 1683 errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n"); 1684 1685 1686 while (count < 4) 1687 { 1688 Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0); 1689 if (result[count] != fcdresult) { 1690 errln("Normalizer.quickCheck(FCD) failed: Data set "+ count 1691 + " expected value "+ result[count]); 1692 } 1693 count ++; 1694 } 1695 1696 /* random checks of long strings */ 1697 //srand((unsigned)time( NULL )); 1698 Random rand = createRandom(); // use test framework's random 1699 1700 for (count = 0; count < 50; count ++) 1701 { 1702 int size = 0; 1703 Normalizer.QuickCheckResult testresult = Normalizer.YES; 1704 char[] data= new char[20]; 1705 char[] norm= new char[100]; 1706 char[] nfd = new 
char[100]; 1707 int normStart = 0; 1708 int nfdsize = 0; 1709 while (size != 19) { 1710 data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX]; 1711 logln("0x"+data[size]); 1712 normStart += Normalizer.normalize(data,size,size+1, 1713 norm,normStart,100, 1714 Normalizer.NFD,0); 1715 size ++; 1716 } 1717 logln("\n"); 1718 1719 nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0); 1720 // nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL, 1721 // nfd, 100, &status); 1722 if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) { 1723 testresult = Normalizer.NO; 1724 } 1725 if (testresult == Normalizer.YES) { 1726 logln("result Normalizer.YES\n"); 1727 } 1728 else { 1729 logln("result Normalizer.NO\n"); 1730 } 1731 1732 if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) { 1733 errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) ); 1734 } 1735 } 1736 } 1737 1738 1739 // reference implementation of Normalizer::compare ref_norm_compare(String s1, String s2, int options)1740 private int ref_norm_compare(String s1, String s2, int options) { 1741 String t1, t2,r1,r2; 1742 1743 int normOptions=options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT; 1744 1745 if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) { 1746 // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 1747 r1 = Normalizer.decompose(s1,false,normOptions); 1748 r2 = Normalizer.decompose(s2,false,normOptions); 1749 r1 = UCharacter.foldCase(r1,options); 1750 r2 = UCharacter.foldCase(r2,options); 1751 }else{ 1752 r1 = s1; 1753 r2 = s2; 1754 } 1755 1756 t1 = Normalizer.decompose(r1, false, normOptions); 1757 t2 = Normalizer.decompose(r2, false, normOptions); 1758 1759 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { 1760 UTF16.StringComparator comp 1761 = new UTF16.StringComparator(true, false, 1762 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1763 return comp.compare(t1,t2); 1764 } else { 1765 return t1.compareTo(t2); 1766 } 1767 1768 } 1769 1770 // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately norm_compare(String s1, String s2, int options)1771 private int norm_compare(String s1, String s2, int options) { 1772 int normOptions=options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT; 1773 1774 if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) && 1775 Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) { 1776 options|=Normalizer.INPUT_IS_FCD; 1777 } 1778 1779 int cmpStrings = Normalizer.compare(s1, s2, options); 1780 int cmpArrays = Normalizer.compare( 1781 s1.toCharArray(), 0, s1.length(), 1782 s2.toCharArray(), 0, s2.length(), options); 1783 assertEquals("compare strings == compare char arrays", cmpStrings, cmpArrays); 1784 return cmpStrings; 1785 } 1786 1787 // reference implementation of UnicodeString::caseCompare ref_case_compare(String s1, String s2, int options)1788 private int ref_case_compare(String s1, String s2, int options) { 1789 String t1, t2; 1790 1791 t1=s1; 1792 t2=s2; 1793 1794 t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); 1795 t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); 1796 1797 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { 1798 UTF16.StringComparator comp 1799 = new UTF16.StringComparator(true, false, 1800 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1801 return comp.compare(t1,t2); 1802 } else { 1803 return 
t1.compareTo(t2); 1804 } 1805 1806 } 1807 1808 // reduce an integer to -1/0/1 sign(int value)1809 private static int sign(int value) { 1810 if(value==0) { 1811 return 0; 1812 } else { 1813 return (value>>31)|1; 1814 } 1815 } signString(int value)1816 private static String signString(int value) { 1817 if(value<0) { 1818 return "<0"; 1819 } else if(value==0) { 1820 return "=0"; 1821 } else /* value>0 */ { 1822 return ">0"; 1823 } 1824 } 1825 // test Normalizer::compare and unorm_compare (thinly wrapped by the former) 1826 // by comparing it with its semantic equivalent 1827 // since we trust the pieces, this is sufficient 1828 1829 // test each string with itself and each other 1830 // each time with all options 1831 private String strings[]=new String[]{ 1832 // some cases from NormalizationTest.txt 1833 // 0..3 1834 "D\u031B\u0307\u0323", 1835 "\u1E0C\u031B\u0307", 1836 "D\u031B\u0323\u0307", 1837 "d\u031B\u0323\u0307", 1838 1839 // 4..6 1840 "\u00E4", 1841 "a\u0308", 1842 "A\u0308", 1843 1844 // Angstrom sign = A ring 1845 // 7..10 1846 "\u212B", 1847 "\u00C5", 1848 "A\u030A", 1849 "a\u030A", 1850 1851 // 11.14 1852 "a\u059A\u0316\u302A\u032Fb", 1853 "a\u302A\u0316\u032F\u059Ab", 1854 "a\u302A\u0316\u032F\u059Ab", 1855 "A\u059A\u0316\u302A\u032Fb", 1856 1857 // from ICU case folding tests 1858 // 15..20 1859 "A\u00df\u00b5\ufb03\\U0001040c\u0131", 1860 "ass\u03bcffi\\U00010434i", 1861 "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff", 1862 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff", 1863 "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff", 1864 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd", 1865 1866 // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold 1867 // vs. U+10000 at bottom - code point order 1868 // 21..22 1869 "\ud800\ud800\udc01", 1870 "\ud800\udc00", 1871 1872 // other code point order tests from ustrtest.cpp 1873 // 23..31 1874 "\u20ac\ud801", 1875 "\u20ac\ud800\udc00", 1876 "\ud800", 1877 "\ud800\uff61", 1878 "\udfff", 1879 "\uff61\udfff", 1880 "\uff61\ud800\udc02", 1881 "\ud800\udc02", 1882 "\ud84d\udc56", 1883 1884 // long strings, see cnormtst.c/TestNormCoverage() 1885 // equivalent if case-insensitive 1886 // 32..33 1887 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1888 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1889 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1890 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1891 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1892 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1893 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1894 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1895 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1896 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1897 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1898 "d\u031B\u0307\u0323", 1899 1900 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ 1901 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1902 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1903 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1904 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1905 
"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1906 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1907 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1908 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1909 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1910 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ 1911 "\u1E0C\u031B\u0307", 1912 1913 // some strings that may make a difference whether the compare function 1914 // case-folds or decomposes first 1915 // 34..41 1916 "\u0360\u0345\u0334", 1917 "\u0360\u03b9\u0334", 1918 1919 "\u0360\u1f80\u0334", 1920 "\u0360\u03b1\u0313\u03b9\u0334", 1921 1922 "\u0360\u1ffc\u0334", 1923 "\u0360\u03c9\u03b9\u0334", 1924 1925 "a\u0360\u0345\u0360\u0345b", 1926 "a\u0345\u0360\u0345\u0360b", 1927 1928 // interesting cases for canonical caseless match with turkic i handling 1929 // 42..43 1930 "\u00cc", 1931 "\u0069\u0300", 1932 1933 // strings with post-Unicode 3.2 normalization or normalization corrections 1934 // 44..45 1935 "\u00e4\u193b\\U0002f868", 1936 "\u0061\u193b\u0308\u36fc", 1937 1938 1939 }; 1940 1941 // all combinations of options 1942 // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions 1943 final class Temp { 1944 int options; 1945 String name; Temp(int opt,String str)1946 public Temp(int opt,String str){ 1947 options =opt; 1948 name = str; 1949 } 1950 1951 } 1952 // set UNORM_UNICODE_3_2 in one additional combination 1953 1954 private Temp[] opt = new Temp[]{ 1955 new Temp(0,"default"), 1956 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ), 1957 new Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ), 1958 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ), 1959 new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"), 1960 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"), 1961 new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2") 1962 }; 1963 1964 1965 @Test TestCompareDebug()1966 public void TestCompareDebug(){ 1967 1968 String[] s = new String[100]; // at least as many items as in strings[] ! 1969 1970 1971 int i, j, k, count=strings.length; 1972 int result, refResult; 1973 1974 // create the UnicodeStrings 1975 for(i=0; i<count; ++i) { 1976 s[i]=Utility.unescape(strings[i]); 1977 } 1978 UTF16.StringComparator comp = new UTF16.StringComparator(true, false, 1979 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1980 // test them each with each other 1981 1982 i = 42; 1983 j = 43; 1984 k = 2; 1985 // test Normalizer::compare 1986 result=norm_compare(s[i], s[j], opt[k].options); 1987 refResult=ref_norm_compare(s[i], s[j], opt[k].options); 1988 if(sign(result)!=sign(refResult)) { 1989 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 1990 } 1991 1992 // test UnicodeString::caseCompare - same internal implementation function 1993 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { 1994 // result=s[i]. 
(s[j], opt[k].options); 1995 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 1996 { 1997 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 1998 } 1999 else { 2000 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 2001 } 2002 2003 result=comp.compare(s[i],s[j]); 2004 refResult=ref_case_compare(s[i], s[j], opt[k].options); 2005 if(sign(result)!=sign(refResult)) { 2006 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 2007 } 2008 } 2009 String value1 = "\u00dater\u00fd"; 2010 String value2 = "\u00fater\u00fd"; 2011 if(Normalizer.compare(value1,value2,0)!=0){ 2012 if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){ 2013 2014 } 2015 } 2016 } 2017 2018 @Test TestCompare()2019 public void TestCompare() { 2020 2021 String[] s = new String[100]; // at least as many items as in strings[] ! 2022 2023 int i, j, k, count=strings.length; 2024 int result, refResult; 2025 2026 // create the UnicodeStrings 2027 for(i=0; i<count; ++i) { 2028 s[i]=Utility.unescape(strings[i]); 2029 } 2030 UTF16.StringComparator comp = new UTF16.StringComparator(); 2031 // test them each with each other 2032 for(i=0; i<count; ++i) { 2033 for(j=i; j<count; ++j) { 2034 for(k=0; k<opt.length; ++k) { 2035 // test Normalizer::compare 2036 result=norm_compare(s[i], s[j], opt[k].options); 2037 refResult=ref_norm_compare(s[i], s[j], opt[k].options); 2038 if(sign(result)!=sign(refResult)) { 2039 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 2040 } 2041 2042 // test UnicodeString::caseCompare - same internal implementation function 2043 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { 2044 // result=s[i]. 
(s[j], opt[k].options); 2045 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 2046 { 2047 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 2048 } 2049 else { 2050 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 2051 } 2052 2053 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); 2054 // result=comp.caseCompare(s[i],s[j], opt[k].options); 2055 result=comp.compare(s[i],s[j]); 2056 refResult=ref_case_compare(s[i], s[j], opt[k].options); 2057 if(sign(result)!=sign(refResult)) { 2058 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 2059 } 2060 } 2061 } 2062 } 2063 } 2064 2065 // test cases with i and I to make sure Turkic works 2066 char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 }; 2067 UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet(); 2068 Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl; 2069 nfcImpl.ensureCanonIterData(); 2070 2071 String s1, s2; 2072 2073 // collect all sets into one for contiguous output 2074 for(i=0; i<iI.length; ++i) { 2075 if(nfcImpl.getCanonStartSet(iI[i], iSet)) { 2076 set.addAll(iSet); 2077 } 2078 } 2079 2080 // test all of these precomposed characters 2081 Normalizer2 nfcNorm2 = Normalizer2.getNFCInstance(); 2082 UnicodeSetIterator it = new UnicodeSetIterator(set); 2083 int c; 2084 while(it.next() && (c=it.codepoint)!=UnicodeSetIterator.IS_STRING) { 2085 s1 = UTF16.valueOf(c); 2086 s2 = nfcNorm2.getDecomposition(c); 2087 for(k=0; k<opt.length; ++k) { 2088 // test Normalizer::compare 2089 2090 result= norm_compare(s1, s2, opt[k].options); 2091 refResult=ref_norm_compare(s1, s2, opt[k].options); 2092 if(sign(result)!=sign(refResult)) { 2093 errln("Normalizer.compare(U+"+hex(c)+" with its NFD, "+opt[k].name+")" 2094 + signString(result)+" should be "+signString(refResult)); 2095 } 2096 2097 // test UnicodeString::caseCompare - same internal implementation function 2098 if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) { 2099 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 2100 { 2101 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 2102 } 2103 else { 2104 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 2105 } 2106 2107 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); 2108 2109 result=comp.compare(s1,s2); 2110 refResult=ref_case_compare(s1, s2, opt[k].options); 2111 if(sign(result)!=sign(refResult)) { 2112 errln("UTF16.compare(U+"+hex(c)+" with its NFD, " 2113 +opt[k].name+")"+signString(result) +" should be "+signString(refResult)); 2114 } 2115 } 2116 } 2117 } 2118 2119 // test getDecomposition() for some characters that do not decompose 2120 if( nfcNorm2.getDecomposition(0x20)!=null || 2121 nfcNorm2.getDecomposition(0x4e00)!=null || 2122 nfcNorm2.getDecomposition(0x20002)!=null 2123 ) { 2124 errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions"); 2125 } 2126 2127 // test getRawDecomposition() for some characters that do not decompose 2128 if( nfcNorm2.getRawDecomposition(0x20)!=null || 2129 nfcNorm2.getRawDecomposition(0x4e00)!=null || 2130 nfcNorm2.getRawDecomposition(0x20002)!=null 2131 ) { 2132 errln("getRawDecomposition() returns TRUE for characters which do not have decompositions"); 2133 } 2134 2135 // test composePair() for some pairs of characters that do not compose 2136 if( nfcNorm2.composePair(0x20, 0x301)>=0 
|| 2137 nfcNorm2.composePair(0x61, 0x305)>=0 || 2138 nfcNorm2.composePair(0x1100, 0x1160)>=0 || 2139 nfcNorm2.composePair(0xac00, 0x11a7)>=0 2140 ) { 2141 errln("NFC.composePair() incorrectly composes some pairs of characters"); 2142 } 2143 2144 // test FilteredNormalizer2.getDecomposition() 2145 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]"); 2146 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2147 if(fn2.getDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getDecomposition(0x100))) { 2148 errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed"); 2149 } 2150 2151 // test FilteredNormalizer2.getRawDecomposition() 2152 if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) { 2153 errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed"); 2154 } 2155 2156 // test FilteredNormalizer2::composePair() 2157 if( 0x100!=fn2.composePair(0x41, 0x304) || 2158 fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08 2159 ) { 2160 errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed"); 2161 } 2162 } 2163 2164 // verify that case-folding does not un-FCD strings countFoldFCDExceptions(int foldingOptions)2165 int countFoldFCDExceptions(int foldingOptions) { 2166 String s, d; 2167 int c; 2168 int count; 2169 int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC; 2170 Normalizer.QuickCheckResult qcResult; 2171 int category; 2172 boolean isNFD; 2173 2174 2175 logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions)); 2176 2177 count=0; 2178 for(c=0; c<=0x10ffff; ++c) { 2179 category=UCharacter.getType(c); 2180 if(category==UCharacterCategory.UNASSIGNED) { 2181 continue; // skip unassigned code points 2182 } 2183 if(c==0xac00) { 2184 c=0xd7a3; // skip Hangul - no case folding there 2185 continue; 2186 } 2187 // skip Han blocks - no case folding there either 2188 if(c==0x3400) { 2189 c=0x4db5; 2190 continue; 2191 } 2192 if(c==0x4e00) { 2193 c=0x9fa5; 2194 continue; 2195 } 2196 if(c==0x20000) { 2197 c=0x2a6d6; 2198 continue; 2199 } 2200 2201 s= UTF16.valueOf(c); 2202 2203 // get leading and trailing cc for c 2204 d= Normalizer.decompose(s,false); 2205 isNFD= s==d; 2206 cc=UCharacter.getCombiningClass(UTF16.charAt(d,0)); 2207 trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); 2208 2209 // get leading and trailing cc for the case-folding of c 2210 UCharacter.foldCase(s,(foldingOptions==0)); 2211 d = Normalizer.decompose(s, false); 2212 foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0)); 2213 foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); 2214 2215 qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0); 2216 2217 2218 // bad: 2219 // - character maps to empty string: adjacent characters may then need reordering 2220 // - folding has different leading/trailing cc's, and they don't become just 0 2221 // - folding itself is not FCD 2222 if( qcResult!=Normalizer.YES || 2223 s.length()==0 || 2224 (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0) 2225 ) { 2226 ++count; 2227 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); 2228 //errln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult); 2229 continue; 2230 } 2231 2232 // also bad: 2233 // if a code point is in NFD but its case folding is not, then 2234 // unorm_compare will also fail 2235 if(isNFD 
&& Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) { 2236 ++count; 2237 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); 2238 } 2239 } 2240 2241 logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" ); 2242 return count; 2243 } 2244 2245 @Test TestFindFoldFCDExceptions()2246 public void TestFindFoldFCDExceptions() { 2247 int count; 2248 2249 count=countFoldFCDExceptions(0); 2250 count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I); 2251 if(count>0) { 2252 //* 2253 //* If case-folding un-FCDs any strings, then unorm_compare() must be 2254 //* re-implemented. 2255 //* It currently assumes that one can check for FCD then case-fold 2256 //* and then still have FCD strings for raw decomposition without reordering. 2257 //* 2258 errln("error: There are "+count+" code points for which case-folding"+ 2259 " may un-FCD a string for all folding options.\n See comment"+ 2260 " in BasicNormalizerTest::FindFoldFCDExceptions()!"); 2261 } 2262 } 2263 2264 @Test TestCombiningMarks()2265 public void TestCombiningMarks(){ 2266 String src = "\u0f71\u0f72\u0f73\u0f74\u0f75"; 2267 String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74"; 2268 String result = Normalizer.decompose(src,false); 2269 if(!expected.equals(result)){ 2270 errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result)); 2271 } 2272 } 2273 2274 /* 2275 * Re-enable this test when UTC fixes UAX 21 2276 @Test 2277 public void TestUAX21Failure(){ 2278 final String[][] cases = new String[][]{ 2279 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"}, 2280 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"}, 2281 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, 2282 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, 2283 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"}, 2284 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"}, 2285 }; 2286 for(int i = 0; i< cases.length; i++){ 2287 String s1 =cases[0][0]; 2288 String s2 = cases[0][1]; 2289 if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare 2290 && 2291 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){ 2292 errln("Normalizer.compare() failed for s1: " 2293 + Utility.hex(s1) +" s2: " + Utility.hex(s2)); 2294 } 2295 } 2296 } 2297 */ 2298 @Test TestFCNFKCClosure()2299 public void TestFCNFKCClosure() { 2300 final class TestStruct{ 2301 int c; 2302 String s; 2303 TestStruct(int cp, String src){ 2304 c=cp; 2305 s=src; 2306 } 2307 } 2308 2309 TestStruct[] tests= new TestStruct[]{ 2310 new TestStruct( 0x00C4, "" ), 2311 new TestStruct( 0x00E4, "" ), 2312 new TestStruct( 0x037A, "\u0020\u03B9" ), 2313 new TestStruct( 0x03D2, "\u03C5" ), 2314 new TestStruct( 0x20A8, "\u0072\u0073" ) , 2315 new TestStruct( 0x210B, "\u0068" ), 2316 new TestStruct( 0x210C, "\u0068" ), 2317 new TestStruct( 0x2121, "\u0074\u0065\u006C" ), 2318 new TestStruct( 0x2122, "\u0074\u006D" ), 2319 new TestStruct( 0x2128, "\u007A" ), 2320 new TestStruct( 0x1D5DB,"\u0068" ), 2321 new TestStruct( 0x1D5ED,"\u007A" ), 2322 new TestStruct( 0x0061, "" ) 2323 }; 2324 2325 2326 for(int i = 0; i < tests.length; ++ i) { 2327 String result=Normalizer.getFC_NFKC_Closure(tests[i].c); 2328 if(!result.equals(new String(tests[i].s))) { 2329 
errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong"); 2330 } 2331 } 2332 2333 /* error handling */ 2334 2335 int length=Normalizer.getFC_NFKC_Closure(0x5c, null); 2336 if(length!=0){ 2337 errln("getFC_NFKC_Closure did not perform error handling correctly"); 2338 } 2339 } 2340 @Test TestBugJ2324()2341 public void TestBugJ2324(){ 2342 /* String[] input = new String[]{ 2343 //"\u30FD\u3099", 2344 "\u30FA\u309A", 2345 "\u30FB\u309A", 2346 "\u30FC\u309A", 2347 "\u30FE\u309A", 2348 "\u30FD\u309A", 2349 2350 };*/ 2351 String troublesome = "\u309A"; 2352 for(int i=0x3000; i<0x3100;i++){ 2353 String input = ((char)i)+troublesome; 2354 try{ 2355 /* String result =*/ Normalizer.compose(input,false); 2356 }catch(IndexOutOfBoundsException e){ 2357 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString()); 2358 } 2359 } 2360 2361 } 2362 2363 static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5; 2364 initSkippables(UnicodeSet[] skipSets)2365 private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) { 2366 skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false); 2367 skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); 2368 skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false); 2369 skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); 2370 2371 // Remove from the NFC and NFKC sets all those characters that change 2372 // when a back-combining character is added. 2373 // First, get all of the back-combining characters and their combining classes. 2374 UnicodeSet combineBack=new UnicodeSet("[:NFC_QC=Maybe:]"); 2375 int numCombineBack=combineBack.size(); 2376 int[] combineBackCharsAndCc=new int[numCombineBack*2]; 2377 UnicodeSetIterator iter=new UnicodeSetIterator(combineBack); 2378 for(int i=0; i<numCombineBack; ++i) { 2379 iter.next(); 2380 int c=iter.codepoint; 2381 combineBackCharsAndCc[2*i]=c; 2382 combineBackCharsAndCc[2*i+1]=UCharacter.getCombiningClass(c); 2383 } 2384 2385 // We need not look at control codes, Han characters nor Hangul LVT syllables because they 2386 // do not combine forward. LV syllables are already removed. 2387 UnicodeSet notInteresting=new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]"); 2388 UnicodeSet unsure=((UnicodeSet)(skipSets[C].clone())).removeAll(notInteresting); 2389 // System.out.format("unsure.size()=%d\n", unsure.size()); 2390 2391 // For each character about which we are unsure, see if it changes when we add 2392 // one of the back-combining characters. 2393 Normalizer2 norm2=Normalizer2.getNFCInstance(); 2394 StringBuilder s=new StringBuilder(); 2395 iter.reset(unsure); 2396 while(iter.next()) { 2397 int c=iter.codepoint; 2398 s.delete(0, 0x7fffffff).appendCodePoint(c); 2399 int cLength=s.length(); 2400 int tccc=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); 2401 for(int i=0; i<numCombineBack; ++i) { 2402 // If c's decomposition ends with a character with non-zero combining class, then 2403 // c can only change if it combines with a character with a non-zero combining class. 
2404 int cc2=combineBackCharsAndCc[2*i+1]; 2405 if(tccc==0 || cc2!=0) { 2406 int c2=combineBackCharsAndCc[2*i]; 2407 s.appendCodePoint(c2); 2408 if(!norm2.isNormalized(s)) { 2409 // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2); 2410 skipSets[C].remove(c); 2411 skipSets[KC].remove(c); 2412 break; 2413 } 2414 s.delete(cLength, 0x7fffffff); 2415 } 2416 } 2417 } 2418 return skipSets; 2419 } 2420 2421 private static String[] kModeStrings = { 2422 "D", "C", "KD", "KC" 2423 }; 2424 2425 @Test TestSkippable()2426 public void TestSkippable() { 2427 UnicodeSet[] skipSets = new UnicodeSet[] { 2428 new UnicodeSet(), //NFD 2429 new UnicodeSet(), //NFC 2430 new UnicodeSet(), //NFKD 2431 new UnicodeSet() //NFKC 2432 }; 2433 UnicodeSet[] expectSets = new UnicodeSet[] { 2434 new UnicodeSet(), 2435 new UnicodeSet(), 2436 new UnicodeSet(), 2437 new UnicodeSet() 2438 }; 2439 StringBuilder s, pattern; 2440 2441 // build NF*Skippable sets from runtime data 2442 skipSets[D].applyPattern("[:NFD_Inert:]"); 2443 skipSets[C].applyPattern("[:NFC_Inert:]"); 2444 skipSets[KD].applyPattern("[:NFKD_Inert:]"); 2445 skipSets[KC].applyPattern("[:NFKC_Inert:]"); 2446 2447 expectSets = initSkippables(expectSets); 2448 if(expectSets[D].contains(0x0350)){ 2449 errln("expectSets[D] contains 0x0350"); 2450 } 2451 for(int i=0; i<expectSets.length; ++i) { 2452 if(!skipSets[i].equals(expectSets[i])) { 2453 String ms = kModeStrings[i]; 2454 errln("error: TestSkippable skipSets["+ms+"]!=expectedSets["+ms+"]\n"); 2455 // Note: This used to depend on hardcoded UnicodeSet patterns generated by 2456 // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by 2457 // running com.ibm.text.UCD.Main with the option NFSkippable. 2458 // Since ICU 4.6/Unicode 6, we are generating the 2459 // expectSets ourselves in initSkippables(). 2460 2461 s=new StringBuilder(); 2462 2463 s.append("\n\nskip= "); 2464 s.append(skipSets[i].toPattern(true)); 2465 s.append("\n\n"); 2466 2467 s.append("skip-expect="); 2468 pattern = new StringBuilder(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true)); 2469 s.append(pattern); 2470 2471 pattern.delete(0,pattern.length()); 2472 s.append("\n\nexpect-skip="); 2473 pattern = new StringBuilder(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true)); 2474 s.append(pattern); 2475 s.append("\n\n"); 2476 2477 pattern.delete(0,pattern.length()); 2478 s.append("\n\nintersection(expect,skip)="); 2479 UnicodeSet intersection = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]); 2480 pattern = new StringBuilder(intersection.toPattern(true)); 2481 s.append(pattern); 2482 // Special: test coverage for append(char). 
2483 s.append('\n'); 2484 s.append('\n'); 2485 2486 errln(s.toString()); 2487 } 2488 } 2489 } 2490 2491 @Test TestBugJ2068()2492 public void TestBugJ2068(){ 2493 String sample = "The quick brown fox jumped over the lazy dog"; 2494 UCharacterIterator text = UCharacterIterator.getInstance(sample); 2495 Normalizer norm = new Normalizer(text,Normalizer.NFC,0); 2496 text.setIndex(4); 2497 if(text.current() == norm.current()){ 2498 errln("Normalizer is not cloning the UCharacterIterator"); 2499 } 2500 } 2501 @Test TestGetCombiningClass()2502 public void TestGetCombiningClass(){ 2503 for(int i=0;i<0x10FFFF;i++){ 2504 int cc = UCharacter.getCombiningClass(i); 2505 if(0xD800<= i && i<=0xDFFF && cc >0 ){ 2506 cc = UCharacter.getCombiningClass(i); 2507 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8)); 2508 } 2509 } 2510 } 2511 2512 @Test TestSerializedSet()2513 public void TestSerializedSet(){ 2514 USerializedSet sset=new USerializedSet(); 2515 UnicodeSet set = new UnicodeSet(); 2516 int start, end; 2517 2518 char[] serialized = { 2519 0x8007, // length 2520 3, // bmpLength 2521 0xc0, 0xfe, 0xfffc, 2522 1, 9, 0x10, 0xfffc 2523 }; 2524 sset.getSet(serialized, 0); 2525 2526 // collect all sets into one for contiguous output 2527 int[] startEnd = new int[2]; 2528 int count=sset.countRanges(); 2529 for(int j=0; j<count; ++j) { 2530 sset.getRange(j, startEnd); 2531 set.add(startEnd[0], startEnd[1]); 2532 } 2533 2534 // test all of these characters 2535 UnicodeSetIterator it = new UnicodeSetIterator(set); 2536 while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) { 2537 start=it.codepoint; 2538 end=it.codepointEnd; 2539 while(start<=end) { 2540 if(!sset.contains(start)){ 2541 errln("USerializedSet.contains failed for "+Utility.hex(start,8)); 2542 } 2543 ++start; 2544 } 2545 } 2546 } 2547 2548 @Test TestReturnFailure()2549 public void TestReturnFailure(){ 2550 char[] term = {'r','\u00e9','s','u','m','\u00e9' }; 2551 char[] decomposed_term = new char[10 + term.length + 2]; 2552 int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0); 2553 int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0); 2554 if(rc!=rc1){ 2555 errln("Normalizer decompose did not return correct length"); 2556 } 2557 } 2558 2559 private final static class TestCompositionCase { 2560 public Normalizer.Mode mode; 2561 public int options; 2562 public String input, expect; TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect)2563 TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) { 2564 this.mode=mode; 2565 this.options=options; 2566 this.input=input; 2567 this.expect=expect; 2568 } 2569 } 2570 2571 @Test TestComposition()2572 public void TestComposition() { 2573 final TestCompositionCase cases[]=new TestCompositionCase[]{ 2574 /* 2575 * special cases for UAX #15 bug 2576 * see Unicode Corrigendum #5: Normalization Idempotency 2577 * at http://unicode.org/versions/corrigendum5.html 2578 * (was Public Review Issue #29) 2579 */ 2580 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"), 2581 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"), 2582 new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"), 2583 new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"), 2584 2585 /* TODO: add test cases 
for UNORM_FCC here (j2151) */ 2586 }; 2587 2588 String output; 2589 int i; 2590 2591 for(i=0; i<cases.length; ++i) { 2592 output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options); 2593 if(!output.equals(cases[i].expect)) { 2594 errln("unexpected result for case "+i); 2595 } 2596 } 2597 } 2598 2599 @Test TestGetDecomposition()2600 public void TestGetDecomposition() { 2601 Normalizer2 n2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE_CONTIGUOUS); 2602 String decomp=n2.getDecomposition(0x20); 2603 assertEquals("fcc.getDecomposition(space) failed", null, decomp); 2604 decomp=n2.getDecomposition(0xe4); 2605 assertEquals("fcc.getDecomposition(a-umlaut) failed", "a\u0308", decomp); 2606 decomp=n2.getDecomposition(0xac01); 2607 assertEquals("fcc.getDecomposition(Hangul syllable U+AC01) failed", "\u1100\u1161\u11a8", decomp); 2608 } 2609 2610 @Test TestGetRawDecomposition()2611 public void TestGetRawDecomposition() { 2612 Normalizer2 n2=Normalizer2.getNFKCInstance(); 2613 /* 2614 * Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values, 2615 * without recursive decomposition. 2616 */ 2617 2618 String decomp=n2.getRawDecomposition(0x20); 2619 assertEquals("nfkc.getRawDecomposition(space) failed", null, decomp); 2620 decomp=n2.getRawDecomposition(0xe4); 2621 assertEquals("nfkc.getRawDecomposition(a-umlaut) failed", "a\u0308", decomp); 2622 /* U+1E08 LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */ 2623 decomp=n2.getRawDecomposition(0x1e08); 2624 assertEquals("nfkc.getRawDecomposition(c-cedilla-acute) failed", "\u00c7\u0301", decomp); 2625 /* U+212B ANGSTROM SIGN */ 2626 decomp=n2.getRawDecomposition(0x212b); 2627 assertEquals("nfkc.getRawDecomposition(angstrom sign) failed", "\u00c5", decomp); 2628 decomp=n2.getRawDecomposition(0xac00); 2629 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC00) failed", "\u1100\u1161", decomp); 2630 /* A Hangul LVT syllable has a raw decomposition of an LV syllable + T. */ 2631 decomp=n2.getRawDecomposition(0xac01); 2632 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC01) failed", "\uac00\u11a8", decomp); 2633 } 2634 2635 @Test TestCustomComp()2636 public void TestCustomComp() { 2637 String [][] pairs={ 2638 // ICU 63 normalization with CodePointTrie requires inert surrogate code points. 
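        // (The commented-out pairs below are the pre-ICU 63 expectations, when the custom test
        // data could still map lone surrogate code points; with inert surrogates, unpaired
        // surrogate code units now pass through unchanged, as in the active pairs.)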
2639 // { "\\uD801\\uE000\\uDFFE", "" }, 2640 // { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, 2641 // { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, 2642 { "\\uD801\\uE000\\uDFFE", "\\uD801\\uDFFE" }, 2643 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD800\\uD801\\uDFFE\\uDFFF" }, 2644 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD800\\U000107FE\\uDFFF" }, 2645 2646 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" }, 2647 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, 2648 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, 2649 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, 2650 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } 2651 }; 2652 Normalizer2 customNorm2; 2653 customNorm2= 2654 Normalizer2.getInstance( 2655 BasicTest.class.getResourceAsStream("/ohos/global/icu/dev/data/testdata/testnorm.nrm"), 2656 "testnorm", 2657 Normalizer2.Mode.COMPOSE); 2658 for(int i=0; i<pairs.length; ++i) { 2659 String[] pair=pairs[i]; 2660 String input=Utility.unescape(pair[0]); 2661 String expected=Utility.unescape(pair[1]); 2662 String result=customNorm2.normalize(input); 2663 if(!result.equals(expected)) { 2664 errln("custom compose Normalizer2 did not normalize input "+i+" as expected"); 2665 } 2666 } 2667 } 2668 2669 @Test TestCustomFCC()2670 public void TestCustomFCC() { 2671 String[][] pairs={ 2672 // ICU 63 normalization with CodePointTrie requires inert surrogate code points. 2673 // { "\\uD801\\uE000\\uDFFE", "" }, 2674 // { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, 2675 // { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, 2676 { "\\uD801\\uE000\\uDFFE", "\\uD801\\uDFFE" }, 2677 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD800\\uD801\\uDFFE\\uDFFF" }, 2678 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD800\\U000107FE\\uDFFF" }, 2679 2680 // The following expected result is different from CustomComp 2681 // because of only-contiguous composition. 2682 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" }, 2683 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, 2684 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, 2685 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, 2686 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } 2687 }; 2688 Normalizer2 customNorm2; 2689 customNorm2= 2690 Normalizer2.getInstance( 2691 BasicTest.class.getResourceAsStream("/ohos/global/icu/dev/data/testdata/testnorm.nrm"), 2692 "testnorm", 2693 Normalizer2.Mode.COMPOSE_CONTIGUOUS); 2694 for(int i=0; i<pairs.length; ++i) { 2695 String[] pair=pairs[i]; 2696 String input=Utility.unescape(pair[0]); 2697 String expected=Utility.unescape(pair[1]); 2698 String result=customNorm2.normalize(input); 2699 if(!result.equals(expected)) { 2700 errln("custom FCC Normalizer2 did not normalize input "+i+" as expected"); 2701 } 2702 } 2703 } 2704 2705 @Test TestCanonIterData()2706 public void TestCanonIterData() { 2707 // For now, just a regression test. 2708 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl.ensureCanonIterData(); 2709 // U+0FB5 TIBETAN SUBJOINED LETTER SSA is the trailing character 2710 // in some decomposition mappings where there is a composition exclusion. 2711 // In fact, U+0FB5 is normalization-inert (NFC_QC=Yes, NFD_QC=Yes, ccc=0) 2712 // but it is not a segment starter because it occurs in a decomposition mapping. 
2713 if(impl.isCanonSegmentStarter(0xfb5)) { 2714 errln("isCanonSegmentStarter(U+0fb5)=true is wrong"); 2715 } 2716 // For [:Segment_Starter:] to work right, not just the property function has to work right, 2717 // UnicodeSet also needs a correct range starts set. 2718 UnicodeSet segStarters=new UnicodeSet("[:Segment_Starter:]").freeze(); 2719 if(segStarters.contains(0xfb5)) { 2720 errln("[:Segment_Starter:].contains(U+0fb5)=true is wrong"); 2721 } 2722 // Try characters up to Kana and miscellaneous CJK but below Han (for expediency). 2723 for(int c=0; c<=0x33ff; ++c) { 2724 boolean isStarter=impl.isCanonSegmentStarter(c); 2725 boolean isContained=segStarters.contains(c); 2726 if(isStarter!=isContained) { 2727 errln(String.format( 2728 "discrepancy: isCanonSegmentStarter(U+%04x)=%5b != " + 2729 "[:Segment_Starter:].contains(same)", 2730 c, isStarter)); 2731 } 2732 } 2733 } 2734 2735 @Test TestFilteredNormalizer2()2736 public void TestFilteredNormalizer2() { 2737 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); 2738 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); 2739 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2740 int c; 2741 for(c=0; c<=0x3ff; ++c) { 2742 int expectedCC= filter.contains(c) ? nfcNorm2.getCombiningClass(c) : 0; 2743 int cc=fn2.getCombiningClass(c); 2744 assertEquals( 2745 "FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+"+hex(c)+ 2746 ")==filtered NFC.getCC()", 2747 expectedCC, cc); 2748 } 2749 2750 // More coverage. 2751 StringBuilder sb=new StringBuilder(); 2752 assertEquals("filtered normalize()", "ää\u0304", 2753 fn2.normalize("a\u0308ä\u0304", (Appendable)sb).toString()); 2754 assertTrue("filtered hasBoundaryAfter()", fn2.hasBoundaryAfter('ä')); 2755 assertTrue("filtered isInert()", fn2.isInert(0x0313)); 2756 } 2757 2758 @Test TestFilteredAppend()2759 public void TestFilteredAppend() { 2760 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); 2761 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); 2762 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2763 2764 // Append two strings that each contain a character outside the filter set. 2765 StringBuilder sb = new StringBuilder("a\u0313a"); 2766 String second = "\u0301\u0313"; 2767 assertEquals("append()", "a\u0313á\u0313", fn2.append(sb, second).toString()); 2768 2769 // Same, and also normalize the second string. 2770 sb.replace(0, 0x7fffffff, "a\u0313a"); 2771 assertEquals( 2772 "normalizeSecondAndAppend()", 2773 "a\u0313á\u0313", fn2.normalizeSecondAndAppend(sb, second).toString()); 2774 2775 // Normalizer2.normalize(String) uses spanQuickCheckYes() and normalizeSecondAndAppend(). 
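        // Roughly (for a String input): int span=fn2.spanQuickCheckYes(s);
        //   StringBuilder dest=new StringBuilder().append(s, 0, span);
        //   fn2.normalizeSecondAndAppend(dest, s.substring(span));
        // so the assertion below covers both the already-normalized prefix path and the append path.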
2776 assertEquals("normalize()", "a\u0313á\u0313", fn2.normalize("a\u0313a\u0301\u0313")); 2777 } 2778 2779 @Test TestGetEasyToUseInstance()2780 public void TestGetEasyToUseInstance() { 2781 // Test input string: 2782 // U+00A0 -> <noBreak> 0020 2783 // U+00C7 0301 = 1E08 = 0043 0327 0301 2784 String in="\u00A0\u00C7\u0301"; 2785 Normalizer2 n2=Normalizer2.getNFCInstance(); 2786 String out=n2.normalize(in); 2787 assertEquals( 2788 "getNFCInstance() did not return an NFC instance " + 2789 "(normalizes to " + prettify(out) + ')', 2790 "\u00A0\u1E08", out); 2791 2792 n2=Normalizer2.getNFDInstance(); 2793 out=n2.normalize(in); 2794 assertEquals( 2795 "getNFDInstance() did not return an NFD instance " + 2796 "(normalizes to " + prettify(out) + ')', 2797 "\u00A0C\u0327\u0301", out); 2798 2799 n2=Normalizer2.getNFKCInstance(); 2800 out=n2.normalize(in); 2801 assertEquals( 2802 "getNFKCInstance() did not return an NFKC instance " + 2803 "(normalizes to " + prettify(out) + ')', 2804 " \u1E08", out); 2805 2806 n2=Normalizer2.getNFKDInstance(); 2807 out=n2.normalize(in); 2808 assertEquals( 2809 "getNFKDInstance() did not return an NFKD instance " + 2810 "(normalizes to " + prettify(out) + ')', 2811 " C\u0327\u0301", out); 2812 2813 n2=Normalizer2.getNFKCCasefoldInstance(); 2814 out=n2.normalize(in); 2815 assertEquals( 2816 "getNFKCCasefoldInstance() did not return an NFKC_Casefold instance " + 2817 "(normalizes to " + prettify(out) + ')', 2818 " \u1E09", out); 2819 } 2820 2821 @Test TestLowMappingToEmpty_D()2822 public void TestLowMappingToEmpty_D() { 2823 Normalizer2 n2 = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.DECOMPOSE); 2824 checkLowMappingToEmpty(n2); 2825 2826 String sh = "\u00AD"; 2827 assertFalse("soft hyphen is not normalized", n2.isNormalized(sh)); 2828 String result = n2.normalize(sh); 2829 assertTrue("soft hyphen normalizes to empty", result.isEmpty()); 2830 assertEquals("soft hyphen QC=No", Normalizer.NO, n2.quickCheck(sh)); 2831 assertEquals("soft hyphen spanQuickCheckYes", 0, n2.spanQuickCheckYes(sh)); 2832 2833 String s = "\u00ADÄ\u00AD\u0323"; 2834 result = n2.normalize(s); 2835 assertEquals("normalize string with soft hyphens", "a\u0323\u0308", result); 2836 } 2837 2838 @Test TestLowMappingToEmpty_FCD()2839 public void TestLowMappingToEmpty_FCD() { 2840 Normalizer2 n2 = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.FCD); 2841 checkLowMappingToEmpty(n2); 2842 2843 String sh = "\u00AD"; 2844 assertTrue("soft hyphen is FCD", n2.isNormalized(sh)); 2845 2846 String s = "\u00ADÄ\u00AD\u0323"; 2847 String result = n2.normalize(s); 2848 assertEquals("normalize string with soft hyphens", "\u00ADa\u0323\u0308", result); 2849 } 2850 checkLowMappingToEmpty(Normalizer2 n2)2851 private void checkLowMappingToEmpty(Normalizer2 n2) { 2852 String mapping = n2.getDecomposition(0xad); 2853 assertNotNull("getDecomposition(soft hyphen)", mapping); 2854 assertTrue("soft hyphen maps to empty", mapping.isEmpty()); 2855 assertFalse("soft hyphen has no boundary before", n2.hasBoundaryBefore(0xad)); 2856 assertFalse("soft hyphen has no boundary after", n2.hasBoundaryAfter(0xad)); 2857 assertFalse("soft hyphen is not inert", n2.isInert(0xad)); 2858 } 2859 2860 @Test TestNormalizeIllFormedText()2861 public void TestNormalizeIllFormedText() { 2862 Normalizer2 nfkc_cf = Normalizer2.getNFKCCasefoldInstance(); 2863 // Normalization behavior for ill-formed text is not defined. 2864 // ICU currently treats ill-formed sequences as normalization-inert 2865 // and copies them unchanged. 
2866 String src = " A\uD800ÄA\u0308\uD900A\u0308\u00ad\u0323\uDBFFÄ\u0323," + 2867 "\u00ad\uDC00\u1100\u1161가\u11A8가\u3133 \uDFFF"; 2868 String expected = " a\uD800ää\uD900ạ\u0308\uDBFFạ\u0308,\uDC00가객갃 \uDFFF"; 2869 String result = nfkc_cf.normalize(src); 2870 assertEquals("normalize", expected, result); 2871 } 2872 2873 @Test TestComposeJamoTBase()2874 public void TestComposeJamoTBase() { 2875 // Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7 2876 // which is not a conjoining Jamo Trailing consonant. 2877 Normalizer2 nfkc = Normalizer2.getNFKCInstance(); 2878 String s = "\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7"; 2879 String expected = "가\u11A7가\u11A7가\u11A7"; 2880 String result = nfkc.normalize(s); 2881 assertEquals("normalize(LV+11A7)", expected, result); 2882 assertFalse("isNormalized(LV+11A7)", nfkc.isNormalized(s)); 2883 assertTrue("isNormalized(normalized)", nfkc.isNormalized(result)); 2884 } 2885 2886 @Test TestComposeBoundaryAfter()2887 public void TestComposeBoundaryAfter() { 2888 Normalizer2 nfkc = Normalizer2.getNFKCInstance(); 2889 // U+02DA and U+FB2C do not have compose-boundaries-after. 2890 String s = "\u02DA\u0339 \uFB2C\u05B6"; 2891 String expected = " \u0339\u030A \u05E9\u05B6\u05BC\u05C1"; 2892 String result = nfkc.normalize(s); 2893 assertEquals("nfkc", expected, result); 2894 assertFalse("U+02DA boundary-after", nfkc.hasBoundaryAfter(0x2DA)); 2895 assertFalse("U+FB2C boundary-after", nfkc.hasBoundaryAfter(0xFB2C)); 2896 } 2897 2898 @Test TestNFC()2899 public void TestNFC() { 2900 // Coverage tests. 2901 Normalizer2 nfc = Normalizer2.getNFCInstance(); 2902 assertTrue("nfc.hasBoundaryAfter(space)", nfc.hasBoundaryAfter(' ')); 2903 assertFalse("nfc.hasBoundaryAfter(ä)", nfc.hasBoundaryAfter('ä')); 2904 } 2905 2906 @Test TestNFD()2907 public void TestNFD() { 2908 // Coverage tests. 2909 Normalizer2 nfd = Normalizer2.getNFDInstance(); 2910 assertTrue("nfd.hasBoundaryAfter(space)", nfd.hasBoundaryAfter(' ')); 2911 assertFalse("nfd.hasBoundaryAfter(ä)", nfd.hasBoundaryAfter('ä')); 2912 } 2913 2914 @Test TestFCD()2915 public void TestFCD() { 2916 // Coverage tests. 2917 Normalizer2 fcd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.FCD); 2918 assertTrue("fcd.hasBoundaryAfter(space)", fcd.hasBoundaryAfter(' ')); 2919 assertFalse("fcd.hasBoundaryAfter(ä)", fcd.hasBoundaryAfter('ä')); 2920 assertTrue("fcd.isInert(space)", fcd.isInert(' ')); 2921 assertFalse("fcd.isInert(ä)", fcd.isInert('ä')); 2922 2923 // This implementation method is unreachable via public API. 2924 Norm2AllModes.FCDNormalizer2 impl = (Norm2AllModes.FCDNormalizer2)fcd; 2925 assertEquals("fcd impl.getQuickCheck(space)", 1, impl.getQuickCheck(' ')); 2926 assertEquals("fcd impl.getQuickCheck(ä)", 0, impl.getQuickCheck('ä')); 2927 } 2928 2929 @Test TestNoneNormalizer()2930 public void TestNoneNormalizer() { 2931 // Use the deprecated Mode Normalizer.NONE for coverage of the internal NoopNormalizer2 2932 // as far as its methods are reachable that way. 2933 assertEquals("NONE.concatenate()", "ä\u0327", 2934 Normalizer.concatenate("ä", "\u0327", Normalizer.NONE, 0)); 2935 assertTrue("NONE.isNormalized()", Normalizer.isNormalized("ä\u0327", Normalizer.NONE, 0)); 2936 } 2937 2938 @Test TestNoopNormalizer2()2939 public void TestNoopNormalizer2() { 2940 // Use the internal class directly for coverage of methods that are not publicly reachable. 
2941 Normalizer2 noop = Norm2AllModes.NOOP_NORMALIZER2; 2942 assertEquals("noop.normalizeSecondAndAppend()", "ä\u0327", 2943 noop.normalizeSecondAndAppend(new StringBuilder("ä"), "\u0327").toString()); 2944 assertEquals("noop.getDecomposition()", null, noop.getDecomposition('ä')); 2945 assertTrue("noop.hasBoundaryAfter()", noop.hasBoundaryAfter(0x0308)); 2946 assertTrue("noop.isInert()", noop.isInert(0x0308)); 2947 } 2948 2949 /* 2950 * Abstract class Normalizer2 has non-abstract methods which are overwritten by 2951 * its derived classes. To test these methods a derived class is defined here. 2952 */ 2953 public class TestNormalizer2 extends Normalizer2 { 2954 TestNormalizer2()2955 public TestNormalizer2() {} 2956 @Override normalize(CharSequence src, StringBuilder dest)2957 public StringBuilder normalize(CharSequence src, StringBuilder dest) { return null; } 2958 @Override normalize(CharSequence src, Appendable dest)2959 public Appendable normalize(CharSequence src, Appendable dest) { return null; } 2960 @Override normalizeSecondAndAppend( StringBuilder first, CharSequence second)2961 public StringBuilder normalizeSecondAndAppend( 2962 StringBuilder first, CharSequence second) { return null; } 2963 @Override append(StringBuilder first, CharSequence second)2964 public StringBuilder append(StringBuilder first, CharSequence second) { return null; } 2965 @Override getDecomposition(int c)2966 public String getDecomposition(int c) { return null; } 2967 @Override isNormalized(CharSequence s)2968 public boolean isNormalized(CharSequence s) { return false; } 2969 @Override quickCheck(CharSequence s)2970 public Normalizer.QuickCheckResult quickCheck(CharSequence s) { return null; } 2971 @Override spanQuickCheckYes(CharSequence s)2972 public int spanQuickCheckYes(CharSequence s) { return 0; } 2973 @Override hasBoundaryBefore(int c)2974 public boolean hasBoundaryBefore(int c) { return false; } 2975 @Override hasBoundaryAfter(int c)2976 public boolean hasBoundaryAfter(int c) { return false; } 2977 @Override isInert(int c)2978 public boolean isInert(int c) { return false; } 2979 } 2980 2981 final TestNormalizer2 tnorm2 = new TestNormalizer2(); 2982 @Test TestGetRawDecompositionBase()2983 public void TestGetRawDecompositionBase() { 2984 int c = 'à'; 2985 assertEquals("Unexpected value returned from Normalizer2.getRawDecomposition()", 2986 null, tnorm2.getRawDecomposition(c)); 2987 } 2988 2989 @Test TestComposePairBase()2990 public void TestComposePairBase() { 2991 int a = 'a'; 2992 int b = '\u0300'; 2993 assertEquals("Unexpected value returned from Normalizer2.composePair()", 2994 -1, tnorm2.composePair(a, b)); 2995 } 2996 2997 @Test TestGetCombiningClassBase()2998 public void TestGetCombiningClassBase() { 2999 int c = '\u00e0'; 3000 assertEquals("Unexpected value returned from Normalizer2.getCombiningClass()", 3001 0, tnorm2.getCombiningClass(c)); 3002 } 3003 } 3004
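/*
 * Illustrative sketch of the basic Normalizer2 usage pattern that the tests above exercise;
 * the variable names are examples only, and the calls use the same
 * ohos.global.icu.text.Normalizer2 API imported at the top of this file.
 *
 *   Normalizer2 nfc = Normalizer2.getNFCInstance();
 *   String composed = nfc.normalize("A\u030A");      // "\u00C5" (A + combining ring above)
 *   boolean ok = nfc.isNormalized(composed);         // true
 *   String decomp = nfc.getDecomposition(0x00C5);    // "A\u030A"
 *   int pair = nfc.composePair('A', 0x030A);         // 0x00C5; negative if the pair does not compose
 *
 *   Normalizer2 nfd = Normalizer2.getNFDInstance();
 *   String decomposed = nfd.normalize(composed);     // "A\u030A"
 */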