1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2000-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.text; 11 import java.nio.CharBuffer; 12 import java.text.CharacterIterator; 13 14 import ohos.global.icu.impl.Norm2AllModes; 15 import ohos.global.icu.impl.Normalizer2Impl; 16 import ohos.global.icu.impl.UCaseProps; 17 import ohos.global.icu.lang.UCharacter; 18 import ohos.global.icu.util.ICUCloneNotSupportedException; 19 20 /** 21 * Old Unicode normalization API. 22 * 23 * <p>This API has been replaced by the {@link Normalizer2} class and is only available 24 * for backward compatibility. This class simply delegates to the Normalizer2 class. 25 * There are two exceptions: The new API does not provide a replacement for 26 * <code>QuickCheckResult</code> and <code>compare()</code>. 27 * 28 * <p><code>normalize</code> transforms Unicode text into an equivalent composed or 29 * decomposed form, allowing for easier sorting and searching of text. 30 * <code>normalize</code> supports the standard normalization forms described in 31 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> 32 * Unicode Standard Annex #15 — Unicode Normalization Forms</a>. 33 * 34 * <p>Characters with accents or other adornments can be encoded in 35 * several different ways in Unicode. For example, take the character A-acute. 
36 * In Unicode, this can be encoded as a single character (the 37 * "composed" form): 38 * 39 * <pre> 40 * 00C1 LATIN CAPITAL LETTER A WITH ACUTE 41 * </pre> 42 * 43 * or as two separate characters (the "decomposed" form): 44 * 45 * <pre> 46 * 0041 LATIN CAPITAL LETTER A 47 * 0301 COMBINING ACUTE ACCENT 48 * </pre> 49 * 50 * <p>To a user of your program, however, both of these sequences should be 51 * treated as the same "user-level" character "A with acute accent". When you 52 * are searching or comparing text, you must ensure that these two sequences are 53 * treated equivalently. In addition, you must handle characters with more than 54 * one accent. Sometimes the order of a character's combining accents is 55 * significant, while in other cases accent sequences in different orders are 56 * really equivalent. 57 * 58 * <p>Similarly, the string "ffi" can be encoded as three separate letters: 59 * 60 * <pre> 61 * 0066 LATIN SMALL LETTER F 62 * 0066 LATIN SMALL LETTER F 63 * 0069 LATIN SMALL LETTER I 64 * </pre> 65 * 66 * or as the single character 67 * 68 * <pre> 69 * FB03 LATIN SMALL LIGATURE FFI 70 * </pre> 71 * 72 * <p>The ffi ligature is not a distinct semantic character, and strictly speaking 73 * it shouldn't be in Unicode at all, but it was included for compatibility 74 * with existing character sets that already provided it. The Unicode standard 75 * identifies such characters by giving them "compatibility" decompositions 76 * into the corresponding semantic characters. When sorting and searching, you 77 * will often want to use these mappings. 78 * 79 * <p><code>normalize</code> helps solve these problems by transforming text into 80 * the canonical composed and decomposed forms as shown in the first example 81 * above. In addition, you can have it perform compatibility decompositions so 82 * that you can treat compatibility characters the same as their equivalents. 
 * Finally, <code>normalize</code> rearranges accents into the proper canonical
 * order, so that you do not have to worry about accent rearrangement on your
 * own.
 *
 * <p>Form FCD, "Fast C or D", is also designed for collation.
 * It allows an algorithm (such as collation) that works under "canonical closure",
 * i.e., one that treats precomposed characters and their decomposed equivalents the
 * same, to work on strings that are not necessarily normalized.
 *
 * <p>It is not a normalization form because it does not provide for uniqueness of
 * representation. Multiple strings may be canonically equivalent (their NFDs
 * are identical) and may all conform to FCD without being identical themselves.
 *
 * <p>The form is defined such that the "raw decomposition", the recursive
 * canonical decomposition of each character, results in a string that is
 * canonically ordered. This means that precomposed characters are allowed
 * as long as their decompositions do not need canonical reordering.
 *
 * <p>Its advantage for a process like collation is that all NFD and most NFC texts
 * - and many unnormalized texts - already conform to FCD and do not need to be
 * normalized (NFD) for such a process. The FCD quick check will return YES for
 * most strings in practice.
 *
 * <p>normalize(FCD) may be implemented with NFD.
 *
 * <p>For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
 * http://www.unicode.org/notes/tn5/#FCD
 *
 * <p>ICU collation performs either NFD or FCD normalization automatically if
 * normalization is turned on for the collator object. Beyond collation and
 * string search, normalized strings may be useful for string equivalence
 * comparisons, transliteration/transcription, unique representations, etc.
 *
 * <p>The W3C generally recommends exchanging text in NFC.
118 * Note also that most legacy character encodings use only precomposed forms and 119 * often do not encode any combining marks by themselves. For conversion to such 120 * character encodings the Unicode text needs to be normalized to NFC. 121 * For more usage examples, see the Unicode Standard Annex. 122 * 123 * <p>Note: The Normalizer class also provides API for iterative normalization. 124 * While the setIndex() and getIndex() refer to indices in the 125 * underlying Unicode input text, the next() and previous() methods 126 * iterate through characters in the normalized output. 127 * This means that there is not necessarily a one-to-one correspondence 128 * between characters returned by next() and previous() and the indices 129 * passed to and returned from setIndex() and getIndex(). 130 * It is for this reason that Normalizer does not implement the CharacterIterator interface. 131 */ 132 public final class Normalizer implements Cloneable { 133 // The input text and our position in it 134 private UCharacterIterator text; 135 private Normalizer2 norm2; 136 private Mode mode; 137 private int options; 138 139 // The normalization buffer is the result of normalization 140 // of the source in [currentIndex..nextIndex[ . 141 private int currentIndex; 142 private int nextIndex; 143 144 // A buffer for holding intermediate results 145 private StringBuilder buffer; 146 private int bufferPos; 147 148 // Helper classes to defer loading of normalization data. 
149 private static final class ModeImpl { ModeImpl(Normalizer2 n2)150 private ModeImpl(Normalizer2 n2) { 151 normalizer2 = n2; 152 } 153 private final Normalizer2 normalizer2; 154 } 155 private static final class NFDModeImpl { 156 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); 157 } 158 private static final class NFKDModeImpl { 159 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); 160 } 161 private static final class NFCModeImpl { 162 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); 163 } 164 private static final class NFKCModeImpl { 165 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); 166 } 167 private static final class FCDModeImpl { 168 private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2()); 169 } 170 171 private static final class Unicode32 { 172 private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); 173 } 174 private static final class NFD32ModeImpl { 175 private static final ModeImpl INSTANCE = 176 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), 177 Unicode32.INSTANCE)); 178 } 179 private static final class NFKD32ModeImpl { 180 private static final ModeImpl INSTANCE = 181 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), 182 Unicode32.INSTANCE)); 183 } 184 private static final class NFC32ModeImpl { 185 private static final ModeImpl INSTANCE = 186 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), 187 Unicode32.INSTANCE)); 188 } 189 private static final class NFKC32ModeImpl { 190 private static final ModeImpl INSTANCE = 191 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), 192 Unicode32.INSTANCE)); 193 } 194 private static final class FCD32ModeImpl { 195 private static final ModeImpl INSTANCE = 196 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(), 197 
Unicode32.INSTANCE)); 198 } 199 200 /** 201 * Options bit set value to select Unicode 3.2 normalization 202 * (except NormalizationCorrections). 203 * At most one Unicode version can be selected at a time. 204 * 205 * @deprecated ICU 56 Use {@link FilteredNormalizer2} instead. 206 * @hide deprecated on icu4j-org 207 */ 208 @Deprecated 209 public static final int UNICODE_3_2=0x20; 210 211 /** 212 * Constant indicating that the end of the iteration has been reached. 213 * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. 214 * 215 * @deprecated ICU 56 216 * @hide deprecated on icu4j-org 217 */ 218 @Deprecated 219 public static final int DONE = UCharacterIterator.DONE; 220 221 /** 222 * Constants for normalization modes. 223 * <p> 224 * The Mode class is not intended for public subclassing. 225 * Only the Mode constants provided by the Normalizer class should be used, 226 * and any fields or methods should not be called or overridden by users. 227 * 228 * @deprecated ICU 56 Use {@link Normalizer2} instead. 229 * @hide exposed on OHOS 230 * @hide deprecated on icu4j-org 231 */ 232 @Deprecated 233 public static abstract class Mode { 234 /** 235 * Sole constructor 236 * @deprecated This API is ICU internal only. 237 * @hide deprecated on icu4j-org 238 * @hide draft / provisional / internal are hidden on OHOS 239 */ 240 @Deprecated Mode()241 protected Mode() { 242 } 243 244 /** 245 * @deprecated This API is ICU internal only. 
246 * @hide deprecated on icu4j-org 247 * @hide draft / provisional / internal are hidden on OHOS 248 */ 249 @Deprecated getNormalizer2(int options)250 protected abstract Normalizer2 getNormalizer2(int options); 251 } 252 253 private static final class NONEMode extends Mode { 254 @Override getNormalizer2(int options)255 protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } 256 } 257 private static final class NFDMode extends Mode { 258 @Override getNormalizer2(int options)259 protected Normalizer2 getNormalizer2(int options) { 260 return (options&UNICODE_3_2) != 0 ? 261 NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2; 262 } 263 } 264 private static final class NFKDMode extends Mode { 265 @Override getNormalizer2(int options)266 protected Normalizer2 getNormalizer2(int options) { 267 return (options&UNICODE_3_2) != 0 ? 268 NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2; 269 } 270 } 271 private static final class NFCMode extends Mode { 272 @Override getNormalizer2(int options)273 protected Normalizer2 getNormalizer2(int options) { 274 return (options&UNICODE_3_2) != 0 ? 275 NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2; 276 } 277 } 278 private static final class NFKCMode extends Mode { 279 @Override getNormalizer2(int options)280 protected Normalizer2 getNormalizer2(int options) { 281 return (options&UNICODE_3_2) != 0 ? 282 NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2; 283 } 284 } 285 private static final class FCDMode extends Mode { 286 @Override getNormalizer2(int options)287 protected Normalizer2 getNormalizer2(int options) { 288 return (options&UNICODE_3_2) != 0 ? 289 FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2; 290 } 291 } 292 293 /** 294 * No decomposition/composition. 295 * 296 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
297 * @hide deprecated on icu4j-org 298 */ 299 @Deprecated 300 public static final Mode NONE = new NONEMode(); 301 302 /** 303 * Canonical decomposition. 304 * 305 * @deprecated ICU 56 Use {@link Normalizer2} instead. 306 * @hide deprecated on icu4j-org 307 */ 308 @Deprecated 309 public static final Mode NFD = new NFDMode(); 310 311 /** 312 * Compatibility decomposition. 313 * 314 * @deprecated ICU 56 Use {@link Normalizer2} instead. 315 * @hide deprecated on icu4j-org 316 */ 317 @Deprecated 318 public static final Mode NFKD = new NFKDMode(); 319 320 /** 321 * Canonical decomposition followed by canonical composition. 322 * 323 * @deprecated ICU 56 Use {@link Normalizer2} instead. 324 * @hide deprecated on icu4j-org 325 */ 326 @Deprecated 327 public static final Mode NFC = new NFCMode(); 328 329 /** 330 * Default normalization. 331 * 332 * @deprecated ICU 56 Use {@link Normalizer2} instead. 333 * @hide deprecated on icu4j-org 334 */ 335 @Deprecated 336 public static final Mode DEFAULT = NFC; 337 338 /** 339 * Compatibility decomposition followed by canonical composition. 340 * 341 * @deprecated ICU 56 Use {@link Normalizer2} instead. 342 * @hide deprecated on icu4j-org 343 */ 344 @Deprecated 345 public static final Mode NFKC =new NFKCMode(); 346 347 /** 348 * "Fast C or D" form. 349 * 350 * @deprecated ICU 56 Use {@link Normalizer2} instead. 351 * @hide deprecated on icu4j-org 352 */ 353 @Deprecated 354 public static final Mode FCD = new FCDMode(); 355 356 /** 357 * Null operation for use with the {@link ohos.global.icu.text.Normalizer constructors} 358 * and the static {@link #normalize normalize} method. This value tells 359 * the <tt>Normalizer</tt> to do nothing but return unprocessed characters 360 * from the underlying String or CharacterIterator. 
If you have code which 361 * requires raw text at some times and normalized text at others, you can 362 * use <tt>NO_OP</tt> for the cases where you want raw text, rather 363 * than having a separate code path that bypasses <tt>Normalizer</tt> 364 * altogether. 365 * <p> 366 * @see #setMode 367 * @deprecated ICU 2.8. Use Nomalizer.NONE 368 * @see #NONE 369 * @hide deprecated on icu4j-org 370 */ 371 @Deprecated 372 public static final Mode NO_OP = NONE; 373 374 /** 375 * Canonical decomposition followed by canonical composition. Used with the 376 * {@link ohos.global.icu.text.Normalizer constructors} and the static 377 * {@link #normalize normalize} method to determine the operation to be 378 * performed. 379 * <p> 380 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 381 * off, this operation produces output that is in 382 * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 383 * Form</a> 384 * <b>C</b>. 385 * <p> 386 * @see #setMode 387 * @deprecated ICU 2.8. Use Normalier.NFC 388 * @see #NFC 389 * @hide deprecated on icu4j-org 390 */ 391 @Deprecated 392 public static final Mode COMPOSE = NFC; 393 394 /** 395 * Compatibility decomposition followed by canonical composition. 396 * Used with the {@link ohos.global.icu.text.Normalizer constructors} and the static 397 * {@link #normalize normalize} method to determine the operation to be 398 * performed. 399 * <p> 400 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 401 * off, this operation produces output that is in 402 * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 403 * Form</a> 404 * <b>KC</b>. 405 * <p> 406 * @see #setMode 407 * @deprecated ICU 2.8. Use Normalizer.NFKC 408 * @see #NFKC 409 * @hide deprecated on icu4j-org 410 */ 411 @Deprecated 412 public static final Mode COMPOSE_COMPAT = NFKC; 413 414 /** 415 * Canonical decomposition. 
This value is passed to the 416 * {@link ohos.global.icu.text.Normalizer constructors} and the static 417 * {@link #normalize normalize} 418 * method to determine the operation to be performed. 419 * <p> 420 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 421 * off, this operation produces output that is in 422 * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 423 * Form</a> 424 * <b>D</b>. 425 * <p> 426 * @see #setMode 427 * @deprecated ICU 2.8. Use Normalizer.NFD 428 * @see #NFD 429 * @hide deprecated on icu4j-org 430 */ 431 @Deprecated 432 public static final Mode DECOMP = NFD; 433 434 /** 435 * Compatibility decomposition. This value is passed to the 436 * {@link ohos.global.icu.text.Normalizer constructors} and the static 437 * {@link #normalize normalize} 438 * method to determine the operation to be performed. 439 * <p> 440 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 441 * off, this operation produces output that is in 442 * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 443 * Form</a> 444 * <b>KD</b>. 445 * <p> 446 * @see #setMode 447 * @deprecated ICU 2.8. Use Normalizer.NFKD 448 * @see #NFKD 449 * @hide deprecated on icu4j-org 450 */ 451 @Deprecated 452 public static final Mode DECOMP_COMPAT = NFKD; 453 454 /** 455 * Option to disable Hangul/Jamo composition and decomposition. 456 * This option applies to Korean text, 457 * which can be represented either in the Jamo alphabet or in Hangul 458 * characters, which are really just two or three Jamo combined 459 * into one visual glyph. Since Jamo takes up more storage space than 460 * Hangul, applications that process only Hangul text may wish to turn 461 * this option on when decomposing text. 
462 * <p> 463 * The Unicode standard treates Hangul to Jamo conversion as a 464 * canonical decomposition, so this option must be turned <b>off</b> if you 465 * wish to transform strings into one of the standard 466 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> 467 * Unicode Normalization Forms</a>. 468 * <p> 469 * @see #setOption 470 * @deprecated ICU 2.8. This option is no longer supported. 471 * @hide deprecated on icu4j-org 472 */ 473 @Deprecated 474 public static final int IGNORE_HANGUL = 0x0001; 475 476 /** 477 * Result values for quickCheck(). 478 * For details see Unicode Technical Report 15. 479 */ 480 public static final class QuickCheckResult{ 481 //private int resultValue; QuickCheckResult(int value)482 private QuickCheckResult(int value) { 483 //resultValue=value; 484 } 485 } 486 /** 487 * Indicates that string is not in the normalized format 488 */ 489 public static final QuickCheckResult NO = new QuickCheckResult(0); 490 491 /** 492 * Indicates that string is in the normalized format 493 */ 494 public static final QuickCheckResult YES = new QuickCheckResult(1); 495 496 /** 497 * Indicates it cannot be determined if string is in the normalized 498 * format without further thorough checks. 499 */ 500 public static final QuickCheckResult MAYBE = new QuickCheckResult(2); 501 502 /** 503 * Option bit for compare: 504 * Case sensitively compare the strings 505 */ 506 public static final int FOLD_CASE_DEFAULT = UCharacter.FOLD_CASE_DEFAULT; 507 508 /** 509 * Option bit for compare: 510 * Both input strings are assumed to fulfill FCD conditions. 511 */ 512 public static final int INPUT_IS_FCD = 0x20000; 513 514 /** 515 * Option bit for compare: 516 * Perform case-insensitive comparison. 517 */ 518 public static final int COMPARE_IGNORE_CASE = 0x10000; 519 520 /** 521 * Option bit for compare: 522 * Compare strings in code point order instead of code unit order. 
523 */ 524 public static final int COMPARE_CODE_POINT_ORDER = 0x8000; 525 526 /** 527 * Option value for case folding: 528 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I 529 * and dotless i appropriately for Turkic languages (tr, az). 530 * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I 531 */ 532 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I; 533 534 /** 535 * Lowest-order bit number of compare() options bits corresponding to 536 * normalization options bits. 537 * 538 * The options parameter for compare() uses most bits for 539 * itself and for various comparison and folding flags. 540 * The most significant bits, however, are shifted down and passed on 541 * to the normalization implementation. 542 * (That is, from compare(..., options, ...), 543 * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the 544 * internal normalization functions.) 545 * 546 * @see #compare 547 * @deprecated ICU 56 Use {@link Normalizer2} instead. 548 * @hide deprecated on icu4j-org 549 */ 550 @Deprecated 551 public static final int COMPARE_NORM_OPTIONS_SHIFT = 20; 552 553 //------------------------------------------------------------------------- 554 // Iterator constructors 555 //------------------------------------------------------------------------- 556 557 /** 558 * Creates a new <tt>Normalizer</tt> object for iterating over the 559 * normalized form of a given string. 560 * <p> 561 * The <tt>options</tt> parameter specifies which optional 562 * <tt>Normalizer</tt> features are to be enabled for this object. 563 * <p> 564 * @param str The string to be normalized. The normalization 565 * will start at the beginning of the string. 566 * 567 * @param mode The normalization mode. 568 * 569 * @param opt Any optional features to be enabled. 570 * Currently the only available option is {@link #UNICODE_3_2}. 
571 * If you want the default behavior corresponding to one of the 572 * standard Unicode Normalization Forms, use 0 for this argument. 573 * @deprecated ICU 56 Use {@link Normalizer2} instead. 574 * @hide deprecated on icu4j-org 575 */ 576 @Deprecated Normalizer(String str, Mode mode, int opt)577 public Normalizer(String str, Mode mode, int opt) { 578 this.text = UCharacterIterator.getInstance(str); 579 this.mode = mode; 580 this.options=opt; 581 norm2 = mode.getNormalizer2(opt); 582 buffer = new StringBuilder(); 583 } 584 585 /** 586 * Creates a new <tt>Normalizer</tt> object for iterating over the 587 * normalized form of the given text. 588 * <p> 589 * @param iter The input text to be normalized. The normalization 590 * will start at the beginning of the string. 591 * 592 * @param mode The normalization mode. 593 * 594 * @param opt Any optional features to be enabled. 595 * Currently the only available option is {@link #UNICODE_3_2}. 596 * If you want the default behavior corresponding to one of the 597 * standard Unicode Normalization Forms, use 0 for this argument. 598 * @deprecated ICU 56 Use {@link Normalizer2} instead. 599 * @hide deprecated on icu4j-org 600 */ 601 @Deprecated Normalizer(CharacterIterator iter, Mode mode, int opt)602 public Normalizer(CharacterIterator iter, Mode mode, int opt) { 603 this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); 604 this.mode = mode; 605 this.options = opt; 606 norm2 = mode.getNormalizer2(opt); 607 buffer = new StringBuilder(); 608 } 609 610 /** 611 * Creates a new <tt>Normalizer</tt> object for iterating over the 612 * normalized form of the given text. 613 * <p> 614 * @param iter The input text to be normalized. The normalization 615 * will start at the beginning of the string. 616 * 617 * @param mode The normalization mode. 618 * @param options The normalization options, ORed together (0 for no options). 619 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
620 * @hide deprecated on icu4j-org 621 */ 622 @Deprecated Normalizer(UCharacterIterator iter, Mode mode, int options)623 public Normalizer(UCharacterIterator iter, Mode mode, int options) { 624 try { 625 this.text = (UCharacterIterator)iter.clone(); 626 this.mode = mode; 627 this.options = options; 628 norm2 = mode.getNormalizer2(options); 629 buffer = new StringBuilder(); 630 } catch (CloneNotSupportedException e) { 631 throw new ICUCloneNotSupportedException(e); 632 } 633 } 634 635 /** 636 * Clones this <tt>Normalizer</tt> object. All properties of this 637 * object are duplicated in the new object, including the cloning of any 638 * {@link CharacterIterator} that was passed in to the constructor 639 * or to {@link #setText(CharacterIterator) setText}. 640 * However, the text storage underlying 641 * the <tt>CharacterIterator</tt> is not duplicated unless the 642 * iterator's <tt>clone</tt> method does so. 643 * 644 * @deprecated ICU 56 Use {@link Normalizer2} instead. 645 * @hide deprecated on icu4j-org 646 */ 647 @Deprecated 648 @Override clone()649 public Object clone() { 650 try { 651 Normalizer copy = (Normalizer) super.clone(); 652 copy.text = (UCharacterIterator) text.clone(); 653 copy.mode = mode; 654 copy.options = options; 655 copy.norm2 = norm2; 656 copy.buffer = new StringBuilder(buffer); 657 copy.bufferPos = bufferPos; 658 copy.currentIndex = currentIndex; 659 copy.nextIndex = nextIndex; 660 return copy; 661 } 662 catch (CloneNotSupportedException e) { 663 throw new ICUCloneNotSupportedException(e); 664 } 665 } 666 667 //-------------------------------------------------------------------------- 668 // Static Utility methods 669 //-------------------------------------------------------------------------- 670 getComposeNormalizer2(boolean compat, int options)671 private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) { 672 return (compat ? 
NFKC : NFC).getNormalizer2(options); 673 } getDecomposeNormalizer2(boolean compat, int options)674 private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) { 675 return (compat ? NFKD : NFD).getNormalizer2(options); 676 } 677 678 /** 679 * Compose a string. 680 * The string will be composed to according to the specified mode. 681 * @param str The string to compose. 682 * @param compat If true the string will be composed according to 683 * NFKC rules and if false will be composed according to 684 * NFC rules. 685 * @return String The composed string 686 * @deprecated ICU 56 Use {@link Normalizer2} instead. 687 * @hide deprecated on icu4j-org 688 */ 689 @Deprecated compose(String str, boolean compat)690 public static String compose(String str, boolean compat) { 691 return compose(str,compat,0); 692 } 693 694 /** 695 * Compose a string. 696 * The string will be composed to according to the specified mode. 697 * @param str The string to compose. 698 * @param compat If true the string will be composed according to 699 * NFKC rules and if false will be composed according to 700 * NFC rules. 701 * @param options The only recognized option is UNICODE_3_2 702 * @return String The composed string 703 * @deprecated ICU 56 Use {@link Normalizer2} instead. 704 * @hide deprecated on icu4j-org 705 */ 706 @Deprecated compose(String str, boolean compat, int options)707 public static String compose(String str, boolean compat, int options) { 708 return getComposeNormalizer2(compat, options).normalize(str); 709 } 710 711 /** 712 * Compose a string. 713 * The string will be composed to according to the specified mode. 714 * @param source The char array to compose. 715 * @param target A char buffer to receive the normalized text. 716 * @param compat If true the char array will be composed according to 717 * NFKC rules and if false will be composed according to 718 * NFC rules. 719 * @param options The normalization options, ORed together (0 for no options). 
720 * @return int The total buffer size needed;if greater than length of 721 * result, the output was truncated. 722 * @exception IndexOutOfBoundsException if target.length is less than the 723 * required length 724 * @deprecated ICU 56 Use {@link Normalizer2} instead. 725 * @hide deprecated on icu4j-org 726 */ 727 @Deprecated compose(char[] source,char[] target, boolean compat, int options)728 public static int compose(char[] source,char[] target, boolean compat, int options) { 729 return compose(source, 0, source.length, target, 0, target.length, compat, options); 730 } 731 732 /** 733 * Compose a string. 734 * The string will be composed to according to the specified mode. 735 * @param src The char array to compose. 736 * @param srcStart Start index of the source 737 * @param srcLimit Limit index of the source 738 * @param dest The char buffer to fill in 739 * @param destStart Start index of the destination buffer 740 * @param destLimit End index of the destination buffer 741 * @param compat If true the char array will be composed according to 742 * NFKC rules and if false will be composed according to 743 * NFC rules. 744 * @param options The normalization options, ORed together (0 for no options). 745 * @return int The total buffer size needed;if greater than length of 746 * result, the output was truncated. 747 * @exception IndexOutOfBoundsException if target.length is less than the 748 * required length 749 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
750 * @hide deprecated on icu4j-org 751 */ 752 @Deprecated compose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)753 public static int compose(char[] src,int srcStart, int srcLimit, 754 char[] dest,int destStart, int destLimit, 755 boolean compat, int options) { 756 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 757 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 758 getComposeNormalizer2(compat, options).normalize(srcBuffer, app); 759 return app.length(); 760 } 761 762 /** 763 * Decompose a string. 764 * The string will be decomposed to according to the specified mode. 765 * @param str The string to decompose. 766 * @param compat If true the string will be decomposed according to NFKD 767 * rules and if false will be decomposed according to NFD 768 * rules. 769 * @return String The decomposed string 770 * @deprecated ICU 56 Use {@link Normalizer2} instead. 771 * @hide deprecated on icu4j-org 772 */ 773 @Deprecated decompose(String str, boolean compat)774 public static String decompose(String str, boolean compat) { 775 return decompose(str,compat,0); 776 } 777 778 /** 779 * Decompose a string. 780 * The string will be decomposed to according to the specified mode. 781 * @param str The string to decompose. 782 * @param compat If true the string will be decomposed according to NFKD 783 * rules and if false will be decomposed according to NFD 784 * rules. 785 * @param options The normalization options, ORed together (0 for no options). 786 * @return String The decomposed string 787 * @deprecated ICU 56 Use {@link Normalizer2} instead. 788 * @hide deprecated on icu4j-org 789 */ 790 @Deprecated decompose(String str, boolean compat, int options)791 public static String decompose(String str, boolean compat, int options) { 792 return getDecomposeNormalizer2(compat, options).normalize(str); 793 } 794 795 /** 796 * Decompose a string. 
797 * The string will be decomposed to according to the specified mode. 798 * @param source The char array to decompose. 799 * @param target A char buffer to receive the normalized text. 800 * @param compat If true the char array will be decomposed according to NFKD 801 * rules and if false will be decomposed according to 802 * NFD rules. 803 * @return int The total buffer size needed;if greater than length of 804 * result,the output was truncated. 805 * @param options The normalization options, ORed together (0 for no options). 806 * @exception IndexOutOfBoundsException if the target capacity is less than 807 * the required length 808 * @deprecated ICU 56 Use {@link Normalizer2} instead. 809 * @hide deprecated on icu4j-org 810 */ 811 @Deprecated decompose(char[] source,char[] target, boolean compat, int options)812 public static int decompose(char[] source,char[] target, boolean compat, int options) { 813 return decompose(source, 0, source.length, target, 0, target.length, compat, options); 814 } 815 816 /** 817 * Decompose a string. 818 * The string will be decomposed to according to the specified mode. 819 * @param src The char array to compose. 820 * @param srcStart Start index of the source 821 * @param srcLimit Limit index of the source 822 * @param dest The char buffer to fill in 823 * @param destStart Start index of the destination buffer 824 * @param destLimit End index of the destination buffer 825 * @param compat If true the char array will be decomposed according to NFKD 826 * rules and if false will be decomposed according to 827 * NFD rules. 828 * @param options The normalization options, ORed together (0 for no options). 829 * @return int The total buffer size needed;if greater than length of 830 * result,the output was truncated. 831 * @exception IndexOutOfBoundsException if the target capacity is less than 832 * the required length 833 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
834 * @hide deprecated on icu4j-org 835 */ 836 @Deprecated decompose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)837 public static int decompose(char[] src,int srcStart, int srcLimit, 838 char[] dest,int destStart, int destLimit, 839 boolean compat, int options) { 840 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 841 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 842 getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app); 843 return app.length(); 844 } 845 846 /** 847 * Normalizes a <tt>String</tt> using the given normalization operation. 848 * <p> 849 * The <tt>options</tt> parameter specifies which optional 850 * <tt>Normalizer</tt> features are to be enabled for this operation. 851 * Currently the only available option is {@link #UNICODE_3_2}. 852 * If you want the default behavior corresponding to one of the standard 853 * Unicode Normalization Forms, use 0 for this argument. 854 * <p> 855 * @param str the input string to be normalized. 856 * @param mode the normalization mode 857 * @param options the optional features to be enabled. 858 * @return String the normalized string 859 * @deprecated ICU 56 Use {@link Normalizer2} instead. 860 * @hide deprecated on icu4j-org 861 */ 862 @Deprecated normalize(String str, Mode mode, int options)863 public static String normalize(String str, Mode mode, int options) { 864 return mode.getNormalizer2(options).normalize(str); 865 } 866 867 /** 868 * Normalize a string. 869 * The string will be normalized according to the specified normalization 870 * mode and options. 871 * @param src The string to normalize. 872 * @param mode The normalization mode; one of Normalizer.NONE, 873 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 874 * Normalizer.NFKD, Normalizer.DEFAULT 875 * @return the normalized string 876 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
877 * @hide deprecated on icu4j-org 878 */ 879 @Deprecated normalize(String src,Mode mode)880 public static String normalize(String src,Mode mode) { 881 return normalize(src, mode, 0); 882 } 883 /** 884 * Normalize a string. 885 * The string will be normalized according to the specified normalization 886 * mode and options. 887 * @param source The char array to normalize. 888 * @param target A char buffer to receive the normalized text. 889 * @param mode The normalization mode; one of Normalizer.NONE, 890 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 891 * Normalizer.NFKD, Normalizer.DEFAULT 892 * @param options The normalization options, ORed together (0 for no options). 893 * @return int The total buffer size needed;if greater than length of 894 * result, the output was truncated. 895 * @exception IndexOutOfBoundsException if the target capacity is less 896 * than the required length 897 * @deprecated ICU 56 Use {@link Normalizer2} instead. 898 * @hide deprecated on icu4j-org 899 */ 900 @Deprecated normalize(char[] source,char[] target, Mode mode, int options)901 public static int normalize(char[] source,char[] target, Mode mode, int options) { 902 return normalize(source,0,source.length,target,0,target.length,mode, options); 903 } 904 905 /** 906 * Normalize a string. 907 * The string will be normalized according to the specified normalization 908 * mode and options. 909 * @param src The char array to compose. 910 * @param srcStart Start index of the source 911 * @param srcLimit Limit index of the source 912 * @param dest The char buffer to fill in 913 * @param destStart Start index of the destination buffer 914 * @param destLimit End index of the destination buffer 915 * @param mode The normalization mode; one of Normalizer.NONE, 916 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 917 * Normalizer.NFKD, Normalizer.DEFAULT 918 * @param options The normalization options, ORed together (0 for no options). 
919 * @return int The total buffer size needed;if greater than length of 920 * result, the output was truncated. 921 * @exception IndexOutOfBoundsException if the target capacity is 922 * less than the required length 923 * @deprecated ICU 56 Use {@link Normalizer2} instead. 924 * @hide deprecated on icu4j-org 925 */ 926 @Deprecated normalize(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, Mode mode, int options)927 public static int normalize(char[] src,int srcStart, int srcLimit, 928 char[] dest,int destStart, int destLimit, 929 Mode mode, int options) { 930 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 931 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 932 mode.getNormalizer2(options).normalize(srcBuffer, app); 933 return app.length(); 934 } 935 936 /** 937 * Normalize a codepoint according to the given mode 938 * @param char32 The input string to be normalized. 939 * @param mode The normalization mode 940 * @param options Options for use with exclusion set and tailored Normalization 941 * The only option that is currently recognized is UNICODE_3_2 942 * @return String The normalized string 943 * @see #UNICODE_3_2 944 * @deprecated ICU 56 Use {@link Normalizer2} instead. 945 * @hide deprecated on icu4j-org 946 */ 947 @Deprecated normalize(int char32, Mode mode, int options)948 public static String normalize(int char32, Mode mode, int options) { 949 if(mode == NFD && options == 0) { 950 String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32); 951 if(decomposition == null) { 952 decomposition = UTF16.valueOf(char32); 953 } 954 return decomposition; 955 } 956 return normalize(UTF16.valueOf(char32), mode, options); 957 } 958 959 /** 960 * Convenience method to normalize a codepoint according to the given mode 961 * @param char32 The input string to be normalized. 
962 * @param mode The normalization mode 963 * @return String The normalized string 964 * @deprecated ICU 56 Use {@link Normalizer2} instead. 965 * @hide deprecated on icu4j-org 966 */ 967 @Deprecated normalize(int char32, Mode mode)968 public static String normalize(int char32, Mode mode) { 969 return normalize(char32, mode, 0); 970 } 971 972 /** 973 * Convenience method. 974 * 975 * @param source string for determining if it is in a normalized format 976 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 977 * Normalizer.NFKC,Normalizer.NFKD) 978 * @return Return code to specify if the text is normalized or not 979 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 980 * @deprecated ICU 56 Use {@link Normalizer2} instead. 981 * @hide deprecated on icu4j-org 982 */ 983 @Deprecated quickCheck(String source, Mode mode)984 public static QuickCheckResult quickCheck(String source, Mode mode) { 985 return quickCheck(source, mode, 0); 986 } 987 988 /** 989 * Performing quick check on a string, to quickly determine if the string is 990 * in a particular normalization format. 991 * Three types of result can be returned Normalizer.YES, Normalizer.NO or 992 * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument 993 * string is in the desired normalized format, Normalizer.NO determines that 994 * argument string is not in the desired normalized format. A 995 * Normalizer.MAYBE result indicates that a more thorough check is required, 996 * the user may have to put the string in its normalized form and compare 997 * the results. 
998 * 999 * @param source string for determining if it is in a normalized format 1000 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 1001 * Normalizer.NFKC,Normalizer.NFKD) 1002 * @param options Options for use with exclusion set and tailored Normalization 1003 * The only option that is currently recognized is UNICODE_3_2 1004 * @return Return code to specify if the text is normalized or not 1005 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 1006 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1007 * @hide deprecated on icu4j-org 1008 */ 1009 @Deprecated quickCheck(String source, Mode mode, int options)1010 public static QuickCheckResult quickCheck(String source, Mode mode, int options) { 1011 return mode.getNormalizer2(options).quickCheck(source); 1012 } 1013 1014 /** 1015 * Convenience method. 1016 * 1017 * @param source Array of characters for determining if it is in a 1018 * normalized format 1019 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 1020 * Normalizer.NFKC,Normalizer.NFKD) 1021 * @param options Options for use with exclusion set and tailored Normalization 1022 * The only option that is currently recognized is UNICODE_3_2 1023 * @return Return code to specify if the text is normalized or not 1024 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 1025 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1026 * @hide deprecated on icu4j-org 1027 */ 1028 @Deprecated quickCheck(char[] source, Mode mode, int options)1029 public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) { 1030 return quickCheck(source, 0, source.length, mode, options); 1031 } 1032 1033 /** 1034 * Performing quick check on a string, to quickly determine if the string is 1035 * in a particular normalization format. 1036 * Three types of result can be returned Normalizer.YES, Normalizer.NO or 1037 * Normalizer.MAYBE. 
Result Normalizer.YES indicates that the argument 1038 * string is in the desired normalized format, Normalizer.NO determines that 1039 * argument string is not in the desired normalized format. A 1040 * Normalizer.MAYBE result indicates that a more thorough check is required, 1041 * the user may have to put the string in its normalized form and compare 1042 * the results. 1043 * 1044 * @param source string for determining if it is in a normalized format 1045 * @param start the start index of the source 1046 * @param limit the limit index of the source it is equal to the length 1047 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 1048 * Normalizer.NFKC,Normalizer.NFKD) 1049 * @param options Options for use with exclusion set and tailored Normalization 1050 * The only option that is currently recognized is UNICODE_3_2 1051 * @return Return code to specify if the text is normalized or not 1052 * (Normalizer.YES, Normalizer.NO or 1053 * Normalizer.MAYBE) 1054 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1055 * @hide deprecated on icu4j-org 1056 */ 1057 @Deprecated quickCheck(char[] source,int start, int limit, Mode mode,int options)1058 public static QuickCheckResult quickCheck(char[] source,int start, 1059 int limit, Mode mode,int options) { 1060 CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start); 1061 return mode.getNormalizer2(options).quickCheck(srcBuffer); 1062 } 1063 1064 /** 1065 * Test if a string is in a given normalization form. 1066 * This is semantically equivalent to source.equals(normalize(source, mode)). 1067 * 1068 * Unlike quickCheck(), this function returns a definitive result, 1069 * never a "maybe". 1070 * For NFD, NFKD, and FCD, both functions work exactly the same. 1071 * For NFC and NFKC where quickCheck may return "maybe", this function will 1072 * perform further tests to arrive at a true/false result. 
1073 * @param src The input array of characters to be checked to see if 1074 * it is normalized 1075 * @param start The strart index in the source 1076 * @param limit The limit index in the source 1077 * @param mode the normalization mode 1078 * @param options Options for use with exclusion set and tailored Normalization 1079 * The only option that is currently recognized is UNICODE_3_2 1080 * @return Boolean value indicating whether the source string is in the 1081 * "mode" normalization form 1082 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1083 * @hide deprecated on icu4j-org 1084 */ 1085 @Deprecated isNormalized(char[] src,int start, int limit, Mode mode, int options)1086 public static boolean isNormalized(char[] src,int start, 1087 int limit, Mode mode, 1088 int options) { 1089 CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start); 1090 return mode.getNormalizer2(options).isNormalized(srcBuffer); 1091 } 1092 1093 /** 1094 * Test if a string is in a given normalization form. 1095 * This is semantically equivalent to source.equals(normalize(source, mode)). 1096 * 1097 * Unlike quickCheck(), this function returns a definitive result, 1098 * never a "maybe". 1099 * For NFD, NFKD, and FCD, both functions work exactly the same. 1100 * For NFC and NFKC where quickCheck may return "maybe", this function will 1101 * perform further tests to arrive at a true/false result. 1102 * @param str the input string to be checked to see if it is 1103 * normalized 1104 * @param mode the normalization mode 1105 * @param options Options for use with exclusion set and tailored Normalization 1106 * The only option that is currently recognized is UNICODE_3_2 1107 * @see #isNormalized 1108 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
1109 * @hide deprecated on icu4j-org 1110 */ 1111 @Deprecated isNormalized(String str, Mode mode, int options)1112 public static boolean isNormalized(String str, Mode mode, int options) { 1113 return mode.getNormalizer2(options).isNormalized(str); 1114 } 1115 1116 /** 1117 * Convenience Method 1118 * @param char32 the input code point to be checked to see if it is 1119 * normalized 1120 * @param mode the normalization mode 1121 * @param options Options for use with exclusion set and tailored Normalization 1122 * The only option that is currently recognized is UNICODE_3_2 1123 * 1124 * @see #isNormalized 1125 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1126 * @hide deprecated on icu4j-org 1127 */ 1128 @Deprecated isNormalized(int char32, Mode mode,int options)1129 public static boolean isNormalized(int char32, Mode mode,int options) { 1130 return isNormalized(UTF16.valueOf(char32), mode, options); 1131 } 1132 1133 /** 1134 * Compare two strings for canonical equivalence. 1135 * Further options include case-insensitive comparison and 1136 * code point order (as opposed to code unit order). 1137 * 1138 * Canonical equivalence between two strings is defined as their normalized 1139 * forms (NFD or NFC) being identical. 1140 * This function compares strings incrementally instead of normalizing 1141 * (and optionally case-folding) both strings entirely, 1142 * improving performance significantly. 1143 * 1144 * Bulk normalization is only necessary if the strings do not fulfill the 1145 * FCD conditions. Only in this case, and only if the strings are relatively 1146 * long, is memory allocated temporarily. 1147 * For FCD strings and short non-FCD strings there is no memory allocation. 1148 * 1149 * Semantically, this is equivalent to 1150 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2))) 1151 * where code point order and foldCase are all optional. 1152 * 1153 * @param s1 First source character array. 
1154 * @param s1Start start index of source 1155 * @param s1Limit limit of the source 1156 * 1157 * @param s2 Second source character array. 1158 * @param s2Start start index of the source 1159 * @param s2Limit limit of the source 1160 * 1161 * @param options A bit set of options: 1162 * - FOLD_CASE_DEFAULT or 0 is used for default options: 1163 * Case-sensitive comparison in code unit order, and the input strings 1164 * are quick-checked for FCD. 1165 * 1166 * - INPUT_IS_FCD 1167 * Set if the caller knows that both s1 and s2 fulfill the FCD 1168 * conditions.If not set, the function will quickCheck for FCD 1169 * and normalize if necessary. 1170 * 1171 * - COMPARE_CODE_POINT_ORDER 1172 * Set to choose code point order instead of code unit order 1173 * 1174 * - COMPARE_IGNORE_CASE 1175 * Set to compare strings case-insensitively using case folding, 1176 * instead of case-sensitively. 1177 * If set, then the following case folding options are used. 1178 * 1179 * 1180 * @return <0 or 0 or >0 as usual for string comparisons 1181 * 1182 * @see #normalize 1183 * @see #FCD 1184 */ compare(char[] s1, int s1Start, int s1Limit, char[] s2, int s2Start, int s2Limit, int options)1185 public static int compare(char[] s1, int s1Start, int s1Limit, 1186 char[] s2, int s2Start, int s2Limit, 1187 int options) { 1188 if( s1==null || s1Start<0 || s1Limit<0 || 1189 s2==null || s2Start<0 || s2Limit<0 || 1190 s1Limit<s1Start || s2Limit<s2Start 1191 ) { 1192 throw new IllegalArgumentException(); 1193 } 1194 return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start), 1195 CharBuffer.wrap(s2, s2Start, s2Limit-s2Start), 1196 options); 1197 } 1198 1199 /** 1200 * Compare two strings for canonical equivalence. 1201 * Further options include case-insensitive comparison and 1202 * code point order (as opposed to code unit order). 1203 * 1204 * Canonical equivalence between two strings is defined as their normalized 1205 * forms (NFD or NFC) being identical. 
1206 * This function compares strings incrementally instead of normalizing 1207 * (and optionally case-folding) both strings entirely, 1208 * improving performance significantly. 1209 * 1210 * Bulk normalization is only necessary if the strings do not fulfill the 1211 * FCD conditions. Only in this case, and only if the strings are relatively 1212 * long, is memory allocated temporarily. 1213 * For FCD strings and short non-FCD strings there is no memory allocation. 1214 * 1215 * Semantically, this is equivalent to 1216 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2))) 1217 * where code point order and foldCase are all optional. 1218 * 1219 * @param s1 First source string. 1220 * @param s2 Second source string. 1221 * 1222 * @param options A bit set of options: 1223 * - FOLD_CASE_DEFAULT or 0 is used for default options: 1224 * Case-sensitive comparison in code unit order, and the input strings 1225 * are quick-checked for FCD. 1226 * 1227 * - INPUT_IS_FCD 1228 * Set if the caller knows that both s1 and s2 fulfill the FCD 1229 * conditions. If not set, the function will quickCheck for FCD 1230 * and normalize if necessary. 1231 * 1232 * - COMPARE_CODE_POINT_ORDER 1233 * Set to choose code point order instead of code unit order 1234 * 1235 * - COMPARE_IGNORE_CASE 1236 * Set to compare strings case-insensitively using case folding, 1237 * instead of case-sensitively. 1238 * If set, then the following case folding options are used. 1239 * 1240 * @return <0 or 0 or >0 as usual for string comparisons 1241 * 1242 * @see #normalize 1243 * @see #FCD 1244 */ compare(String s1, String s2, int options)1245 public static int compare(String s1, String s2, int options) { 1246 return internalCompare(s1, s2, options); 1247 } 1248 1249 /** 1250 * Compare two strings for canonical equivalence. 1251 * Further options include case-insensitive comparison and 1252 * code point order (as opposed to code unit order). 1253 * Convenience method. 
1254 * 1255 * @param s1 First source string. 1256 * @param s2 Second source string. 1257 * 1258 * @param options A bit set of options: 1259 * - FOLD_CASE_DEFAULT or 0 is used for default options: 1260 * Case-sensitive comparison in code unit order, and the input strings 1261 * are quick-checked for FCD. 1262 * 1263 * - INPUT_IS_FCD 1264 * Set if the caller knows that both s1 and s2 fulfill the FCD 1265 * conditions. If not set, the function will quickCheck for FCD 1266 * and normalize if necessary. 1267 * 1268 * - COMPARE_CODE_POINT_ORDER 1269 * Set to choose code point order instead of code unit order 1270 * 1271 * - COMPARE_IGNORE_CASE 1272 * Set to compare strings case-insensitively using case folding, 1273 * instead of case-sensitively. 1274 * If set, then the following case folding options are used. 1275 * 1276 * @return <0 or 0 or >0 as usual for string comparisons 1277 * 1278 * @see #normalize 1279 * @see #FCD 1280 */ compare(char[] s1, char[] s2, int options)1281 public static int compare(char[] s1, char[] s2, int options) { 1282 return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options); 1283 } 1284 1285 /** 1286 * Convenience method that can have faster implementation 1287 * by not allocating buffers. 1288 * @param char32a the first code point to be checked against the 1289 * @param char32b the second code point 1290 * @param options A bit set of options 1291 */ compare(int char32a, int char32b, int options)1292 public static int compare(int char32a, int char32b, int options) { 1293 return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD); 1294 } 1295 1296 /** 1297 * Convenience method that can have faster implementation 1298 * by not allocating buffers. 
1299 * @param char32a the first code point to be checked against 1300 * @param str2 the second string 1301 * @param options A bit set of options 1302 */ compare(int char32a, String str2, int options)1303 public static int compare(int char32a, String str2, int options) { 1304 return internalCompare(UTF16.valueOf(char32a), str2, options); 1305 } 1306 1307 /* Concatenation of normalized strings --------------------------------- */ 1308 /** 1309 * Concatenate normalized strings, making sure that the result is normalized 1310 * as well. 1311 * 1312 * If both the left and the right strings are in 1313 * the normalization form according to "mode", 1314 * then the result will be 1315 * 1316 * <code> 1317 * dest=normalize(left+right, mode) 1318 * </code> 1319 * 1320 * With the input strings already being normalized, 1321 * this function will use next() and previous() 1322 * to find the adjacent end pieces of the input strings. 1323 * Only the concatenation of these end pieces will be normalized and 1324 * then concatenated with the remaining parts of the input strings. 1325 * 1326 * It is allowed to have dest==left to avoid copying the entire left string. 1327 * 1328 * @param left Left source array, may be same as dest. 1329 * @param leftStart start in the left array. 1330 * @param leftLimit limit in the left array (==length) 1331 * @param right Right source array. 1332 * @param rightStart start in the right array. 1333 * @param rightLimit limit in the right array (==length) 1334 * @param dest The output buffer; can be null if destStart==destLimit==0 1335 * for pure preflighting. 1336 * @param destStart start in the destination array 1337 * @param destLimit limit in the destination array (==length) 1338 * @param mode The normalization mode. 1339 * @param options The normalization options, ORed together (0 for no options). 
1340 * @return Length of output (number of chars) when successful or 1341 * IndexOutOfBoundsException 1342 * @exception IndexOutOfBoundsException whose message has the string 1343 * representation of destination capacity required. 1344 * @see #normalize 1345 * @see #next 1346 * @see #previous 1347 * @exception IndexOutOfBoundsException if target capacity is less than the 1348 * required length 1349 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1350 * @hide deprecated on icu4j-org 1351 */ 1352 @Deprecated concatenate(char[] left, int leftStart, int leftLimit, char[] right, int rightStart, int rightLimit, char[] dest, int destStart, int destLimit, Normalizer.Mode mode, int options)1353 public static int concatenate(char[] left, int leftStart, int leftLimit, 1354 char[] right, int rightStart, int rightLimit, 1355 char[] dest, int destStart, int destLimit, 1356 Normalizer.Mode mode, int options) { 1357 if(dest == null) { 1358 throw new IllegalArgumentException(); 1359 } 1360 1361 /* check for overlapping right and destination */ 1362 if (right == dest && rightStart < destLimit && destStart < rightLimit) { 1363 throw new IllegalArgumentException("overlapping right and dst ranges"); 1364 } 1365 1366 /* allow left==dest */ 1367 StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16); 1368 destBuilder.append(left, leftStart, leftLimit-leftStart); 1369 CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart); 1370 mode.getNormalizer2(options).append(destBuilder, rightBuffer); 1371 int destLength=destBuilder.length(); 1372 if(destLength<=(destLimit-destStart)) { 1373 destBuilder.getChars(0, destLength, dest, destStart); 1374 return destLength; 1375 } else { 1376 throw new IndexOutOfBoundsException(Integer.toString(destLength)); 1377 } 1378 } 1379 1380 /** 1381 * Concatenate normalized strings, making sure that the result is normalized 1382 * as well. 
1383 * 1384 * If both the left and the right strings are in 1385 * the normalization form according to "mode", 1386 * then the result will be 1387 * 1388 * <code> 1389 * dest=normalize(left+right, mode) 1390 * </code> 1391 * 1392 * For details see concatenate 1393 * 1394 * @param left Left source string. 1395 * @param right Right source string. 1396 * @param mode The normalization mode. 1397 * @param options The normalization options, ORed together (0 for no options). 1398 * @return result 1399 * 1400 * @see #concatenate 1401 * @see #normalize 1402 * @see #next 1403 * @see #previous 1404 * @see #concatenate 1405 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1406 * @hide deprecated on icu4j-org 1407 */ 1408 @Deprecated concatenate(char[] left, char[] right,Mode mode, int options)1409 public static String concatenate(char[] left, char[] right,Mode mode, int options) { 1410 StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left); 1411 return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString(); 1412 } 1413 1414 /** 1415 * Concatenate normalized strings, making sure that the result is normalized 1416 * as well. 1417 * 1418 * If both the left and the right strings are in 1419 * the normalization form according to "mode", 1420 * then the result will be 1421 * 1422 * <code> 1423 * dest=normalize(left+right, mode) 1424 * </code> 1425 * 1426 * With the input strings already being normalized, 1427 * this function will use next() and previous() 1428 * to find the adjacent end pieces of the input strings. 1429 * Only the concatenation of these end pieces will be normalized and 1430 * then concatenated with the remaining parts of the input strings. 1431 * 1432 * @param left Left source string. 1433 * @param right Right source string. 1434 * @param mode The normalization mode. 1435 * @param options The normalization options, ORed together (0 for no options). 
1436 * @return result 1437 * 1438 * @see #concatenate 1439 * @see #normalize 1440 * @see #next 1441 * @see #previous 1442 * @see #concatenate 1443 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1444 * @hide deprecated on icu4j-org 1445 */ 1446 @Deprecated concatenate(String left, String right, Mode mode, int options)1447 public static String concatenate(String left, String right, Mode mode, int options) { 1448 StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left); 1449 return mode.getNormalizer2(options).append(dest, right).toString(); 1450 } 1451 1452 /** 1453 * Gets the FC_NFKC closure value. 1454 * @param c The code point whose closure value is to be retrieved 1455 * @param dest The char array to receive the closure value 1456 * @return the length of the closure value; 0 if there is none 1457 * @deprecated ICU 56 1458 * @hide deprecated on icu4j-org 1459 */ 1460 @Deprecated getFC_NFKC_Closure(int c,char[] dest)1461 public static int getFC_NFKC_Closure(int c,char[] dest) { 1462 String closure=getFC_NFKC_Closure(c); 1463 int length=closure.length(); 1464 if(length!=0 && dest!=null && length<=dest.length) { 1465 closure.getChars(0, length, dest, 0); 1466 } 1467 return length; 1468 } 1469 /** 1470 * Gets the FC_NFKC closure value. 1471 * @param c The code point whose closure value is to be retrieved 1472 * @return String representation of the closure value; "" if there is none 1473 * @deprecated ICU 56 1474 * @hide deprecated on icu4j-org 1475 */ 1476 @Deprecated getFC_NFKC_Closure(int c)1477 public static String getFC_NFKC_Closure(int c) { 1478 // Compute the FC_NFKC_Closure on the fly: 1479 // We have the API for complete coverage of Unicode properties, although 1480 // this value by itself is not useful via API. 1481 // (What could be useful is a custom normalization table that combines 1482 // case folding and NFKC.) 1483 // For the derivation, see Unicode's DerivedNormalizationProps.txt. 
1484 Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2; 1485 UCaseProps csp=UCaseProps.INSTANCE; 1486 // first: b = NFKC(Fold(a)) 1487 StringBuilder folded=new StringBuilder(); 1488 int folded1Length=csp.toFullFolding(c, folded, 0); 1489 if(folded1Length<0) { 1490 Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl; 1491 if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) { 1492 return ""; // c does not change at all under CaseFolding+NFKC 1493 } 1494 folded.appendCodePoint(c); 1495 } else { 1496 if(folded1Length>UCaseProps.MAX_STRING_LENGTH) { 1497 folded.appendCodePoint(folded1Length); 1498 } 1499 } 1500 String kc1=nfkc.normalize(folded); 1501 // second: c = NFKC(Fold(b)) 1502 String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0)); 1503 // if (c != b) add the mapping from a to c 1504 if(kc1.equals(kc2)) { 1505 return ""; 1506 } else { 1507 return kc2; 1508 } 1509 } 1510 1511 //------------------------------------------------------------------------- 1512 // Iteration API 1513 //------------------------------------------------------------------------- 1514 1515 /** 1516 * Return the current character in the normalized text. 1517 * @return The codepoint as an int 1518 * @deprecated ICU 56 1519 * @hide deprecated on icu4j-org 1520 */ 1521 @Deprecated current()1522 public int current() { 1523 if(bufferPos<buffer.length() || nextNormalize()) { 1524 return buffer.codePointAt(bufferPos); 1525 } else { 1526 return DONE; 1527 } 1528 } 1529 1530 /** 1531 * Return the next character in the normalized text and advance 1532 * the iteration position by one. If the end 1533 * of the text has already been reached, {@link #DONE} is returned. 
1534 * @return The codepoint as an int 1535 * @deprecated ICU 56 1536 * @hide deprecated on icu4j-org 1537 */ 1538 @Deprecated next()1539 public int next() { 1540 if(bufferPos<buffer.length() || nextNormalize()) { 1541 int c=buffer.codePointAt(bufferPos); 1542 bufferPos+=Character.charCount(c); 1543 return c; 1544 } else { 1545 return DONE; 1546 } 1547 } 1548 1549 1550 /** 1551 * Return the previous character in the normalized text and decrement 1552 * the iteration position by one. If the beginning 1553 * of the text has already been reached, {@link #DONE} is returned. 1554 * @return The codepoint as an int 1555 * @deprecated ICU 56 1556 * @hide deprecated on icu4j-org 1557 */ 1558 @Deprecated previous()1559 public int previous() { 1560 if(bufferPos>0 || previousNormalize()) { 1561 int c=buffer.codePointBefore(bufferPos); 1562 bufferPos-=Character.charCount(c); 1563 return c; 1564 } else { 1565 return DONE; 1566 } 1567 } 1568 1569 /** 1570 * Reset the index to the beginning of the text. 1571 * This is equivalent to setIndexOnly(startIndex)). 1572 * @deprecated ICU 56 1573 * @hide deprecated on icu4j-org 1574 */ 1575 @Deprecated reset()1576 public void reset() { 1577 text.setToStart(); 1578 currentIndex=nextIndex=0; 1579 clearBuffer(); 1580 } 1581 1582 /** 1583 * Set the iteration position in the input text that is being normalized, 1584 * without any immediate normalization. 1585 * After setIndexOnly(), getIndex() will return the same index that is 1586 * specified here. 1587 * 1588 * @param index the desired index in the input text. 1589 * @deprecated ICU 56 1590 * @hide deprecated on icu4j-org 1591 */ 1592 @Deprecated setIndexOnly(int index)1593 public void setIndexOnly(int index) { 1594 text.setIndex(index); // validates index 1595 currentIndex=nextIndex=index; 1596 clearBuffer(); 1597 } 1598 1599 /** 1600 * Set the iteration position in the input text that is being normalized 1601 * and return the first normalized character at that position. 
1602 * <p> 1603 * <b>Note:</b> This method sets the position in the <em>input</em> text, 1604 * while {@link #next} and {@link #previous} iterate through characters 1605 * in the normalized <em>output</em>. This means that there is not 1606 * necessarily a one-to-one correspondence between characters returned 1607 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and 1608 * returned from <tt>setIndex</tt> and {@link #getIndex}. 1609 * <p> 1610 * @param index the desired index in the input text. 1611 * 1612 * @return the first normalized character that is the result of iterating 1613 * forward starting at the given index. 1614 * 1615 * @throws IllegalArgumentException if the given index is less than 1616 * {@link #getBeginIndex} or greater than {@link #getEndIndex}. 1617 * @deprecated ICU 3.2 1618 * @obsolete ICU 3.2 1619 * @hide deprecated on icu4j-org 1620 */ 1621 @Deprecated 1622 ///CLOVER:OFF setIndex(int index)1623 public int setIndex(int index) { 1624 setIndexOnly(index); 1625 return current(); 1626 } 1627 ///CLOVER:ON 1628 /** 1629 * Retrieve the index of the start of the input text. This is the begin 1630 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the 1631 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating 1632 * @deprecated ICU 2.2. Use startIndex() instead. 1633 * @return The codepoint as an int 1634 * @see #startIndex 1635 * @hide deprecated on icu4j-org 1636 */ 1637 @Deprecated getBeginIndex()1638 public int getBeginIndex() { 1639 return 0; 1640 } 1641 1642 /** 1643 * Retrieve the index of the end of the input text. This is the end index 1644 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 1645 * over which this <tt>Normalizer</tt> is iterating 1646 * @deprecated ICU 2.2. Use endIndex() instead. 
1647 * @return The codepoint as an int 1648 * @see #endIndex 1649 * @hide deprecated on icu4j-org 1650 */ 1651 @Deprecated getEndIndex()1652 public int getEndIndex() { 1653 return endIndex(); 1654 } 1655 /** 1656 * Return the first character in the normalized text. This resets 1657 * the <tt>Normalizer's</tt> position to the beginning of the text. 1658 * @return The codepoint as an int 1659 * @deprecated ICU 56 1660 * @hide deprecated on icu4j-org 1661 */ 1662 @Deprecated first()1663 public int first() { 1664 reset(); 1665 return next(); 1666 } 1667 1668 /** 1669 * Return the last character in the normalized text. This resets 1670 * the <tt>Normalizer's</tt> position to be just before the 1671 * the input text corresponding to that normalized character. 1672 * @return The codepoint as an int 1673 * @deprecated ICU 56 1674 * @hide deprecated on icu4j-org 1675 */ 1676 @Deprecated last()1677 public int last() { 1678 text.setToLimit(); 1679 currentIndex=nextIndex=text.getIndex(); 1680 clearBuffer(); 1681 return previous(); 1682 } 1683 1684 /** 1685 * Retrieve the current iteration position in the input text that is 1686 * being normalized. This method is useful in applications such as 1687 * searching, where you need to be able to determine the position in 1688 * the input text that corresponds to a given normalized output character. 1689 * <p> 1690 * <b>Note:</b> This method sets the position in the <em>input</em>, while 1691 * {@link #next} and {@link #previous} iterate through characters in the 1692 * <em>output</em>. This means that there is not necessarily a one-to-one 1693 * correspondence between characters returned by <tt>next</tt> and 1694 * <tt>previous</tt> and the indices passed to and returned from 1695 * <tt>setIndex</tt> and {@link #getIndex}. 
1696 * @return The current iteration position 1697 * @deprecated ICU 56 1698 * @hide deprecated on icu4j-org 1699 */ 1700 @Deprecated getIndex()1701 public int getIndex() { 1702 if(bufferPos<buffer.length()) { 1703 return currentIndex; 1704 } else { 1705 return nextIndex; 1706 } 1707 } 1708 1709 /** 1710 * Retrieve the index of the start of the input text. This is the begin 1711 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the 1712 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating 1713 * @return The current iteration position 1714 * @deprecated ICU 56 1715 * @hide deprecated on icu4j-org 1716 */ 1717 @Deprecated startIndex()1718 public int startIndex() { 1719 return 0; 1720 } 1721 1722 /** 1723 * Retrieve the index of the end of the input text. This is the end index 1724 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 1725 * over which this <tt>Normalizer</tt> is iterating 1726 * @return The current iteration position 1727 * @deprecated ICU 56 1728 * @hide deprecated on icu4j-org 1729 */ 1730 @Deprecated endIndex()1731 public int endIndex() { 1732 return text.getLength(); 1733 } 1734 1735 //------------------------------------------------------------------------- 1736 // Iterator attributes 1737 //------------------------------------------------------------------------- 1738 /** 1739 * Set the normalization mode for this object. 1740 * <p> 1741 * <b>Note:</b>If the normalization mode is changed while iterating 1742 * over a string, calls to {@link #next} and {@link #previous} may 1743 * return previously buffers characters in the old normalization mode 1744 * until the iteration is able to re-sync at the next base character. 1745 * It is safest to call {@link #setText setText()}, {@link #first}, 1746 * {@link #last}, etc. after calling <tt>setMode</tt>. 1747 * <p> 1748 * @param newMode the new mode for this <tt>Normalizer</tt>. 
1749 * The supported modes are: 1750 * <ul> 1751 * <li>{@link #NFC} - Unicode canonical decompositiion 1752 * followed by canonical composition. 1753 * <li>{@link #NFKC} - Unicode compatibility decompositiion 1754 * follwed by canonical composition. 1755 * <li>{@link #NFD} - Unicode canonical decomposition 1756 * <li>{@link #NFKD} - Unicode compatibility decomposition. 1757 * <li>{@link #NONE} - Do nothing but return characters 1758 * from the underlying input text. 1759 * </ul> 1760 * 1761 * @see #getMode 1762 * @deprecated ICU 56 1763 * @hide deprecated on icu4j-org 1764 */ 1765 @Deprecated setMode(Mode newMode)1766 public void setMode(Mode newMode) { 1767 mode = newMode; 1768 norm2 = mode.getNormalizer2(options); 1769 } 1770 /** 1771 * Return the basic operation performed by this <tt>Normalizer</tt> 1772 * 1773 * @see #setMode 1774 * @deprecated ICU 56 1775 * @hide deprecated on icu4j-org 1776 */ 1777 @Deprecated getMode()1778 public Mode getMode() { 1779 return mode; 1780 } 1781 /** 1782 * Set options that affect this <tt>Normalizer</tt>'s operation. 1783 * Options do not change the basic composition or decomposition operation 1784 * that is being performed , but they control whether 1785 * certain optional portions of the operation are done. 1786 * Currently the only available option is: 1787 * 1788 * <ul> 1789 * <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2. 1790 * </ul> 1791 * 1792 * @param option the option whose value is to be set. 1793 * @param value the new setting for the option. Use <tt>true</tt> to 1794 * turn the option on and <tt>false</tt> to turn it off. 
1795 * 1796 * @see #getOption 1797 * @deprecated ICU 56 1798 * @hide deprecated on icu4j-org 1799 */ 1800 @Deprecated setOption(int option,boolean value)1801 public void setOption(int option,boolean value) { 1802 if (value) { 1803 options |= option; 1804 } else { 1805 options &= (~option); 1806 } 1807 norm2 = mode.getNormalizer2(options); 1808 } 1809 1810 /** 1811 * Determine whether an option is turned on or off. 1812 * <p> 1813 * @see #setOption 1814 * @deprecated ICU 56 1815 * @hide deprecated on icu4j-org 1816 */ 1817 @Deprecated getOption(int option)1818 public int getOption(int option) { 1819 if((options & option)!=0) { 1820 return 1 ; 1821 } else { 1822 return 0; 1823 } 1824 } 1825 1826 /** 1827 * Gets the underlying text storage 1828 * @param fillIn the char buffer to fill the UTF-16 units. 1829 * The length of the buffer should be equal to the length of the 1830 * underlying text storage 1831 * @throws IndexOutOfBoundsException If the index passed for the array is invalid. 1832 * @see #getLength 1833 * @deprecated ICU 56 1834 * @hide deprecated on icu4j-org 1835 */ 1836 @Deprecated getText(char[] fillIn)1837 public int getText(char[] fillIn) { 1838 return text.getText(fillIn); 1839 } 1840 1841 /** 1842 * Gets the length of underlying text storage 1843 * @return the length 1844 * @deprecated ICU 56 1845 * @hide deprecated on icu4j-org 1846 */ 1847 @Deprecated getLength()1848 public int getLength() { 1849 return text.getLength(); 1850 } 1851 1852 /** 1853 * Returns the text under iteration as a string 1854 * @return a copy of the text under iteration. 1855 * @deprecated ICU 56 1856 * @hide deprecated on icu4j-org 1857 */ 1858 @Deprecated getText()1859 public String getText() { 1860 return text.getText(); 1861 } 1862 1863 /** 1864 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1865 * The iteration position is set to the beginning of the input text. 1866 * @param newText The new string to be normalized. 
1867 * @deprecated ICU 56 1868 * @hide deprecated on icu4j-org 1869 */ 1870 @Deprecated setText(StringBuffer newText)1871 public void setText(StringBuffer newText) { 1872 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1873 if (newIter == null) { 1874 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1875 } 1876 text = newIter; 1877 reset(); 1878 } 1879 1880 /** 1881 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1882 * The iteration position is set to the beginning of the input text. 1883 * @param newText The new string to be normalized. 1884 * @deprecated ICU 56 1885 * @hide deprecated on icu4j-org 1886 */ 1887 @Deprecated setText(char[] newText)1888 public void setText(char[] newText) { 1889 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1890 if (newIter == null) { 1891 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1892 } 1893 text = newIter; 1894 reset(); 1895 } 1896 1897 /** 1898 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1899 * The iteration position is set to the beginning of the input text. 1900 * @param newText The new string to be normalized. 1901 * @deprecated ICU 56 1902 * @hide deprecated on icu4j-org 1903 */ 1904 @Deprecated setText(String newText)1905 public void setText(String newText) { 1906 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1907 if (newIter == null) { 1908 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1909 } 1910 text = newIter; 1911 reset(); 1912 } 1913 1914 /** 1915 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1916 * The iteration position is set to the beginning of the input text. 1917 * @param newText The new string to be normalized. 
1918 * @deprecated ICU 56 1919 * @hide deprecated on icu4j-org 1920 */ 1921 @Deprecated setText(CharacterIterator newText)1922 public void setText(CharacterIterator newText) { 1923 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1924 if (newIter == null) { 1925 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1926 } 1927 text = newIter; 1928 reset(); 1929 } 1930 1931 /** 1932 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1933 * The iteration position is set to the beginning of the string. 1934 * @param newText The new string to be normalized. 1935 * @deprecated ICU 56 1936 * @hide deprecated on icu4j-org 1937 */ 1938 @Deprecated setText(UCharacterIterator newText)1939 public void setText(UCharacterIterator newText) { 1940 try{ 1941 UCharacterIterator newIter = (UCharacterIterator)newText.clone(); 1942 if (newIter == null) { 1943 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1944 } 1945 text = newIter; 1946 reset(); 1947 }catch(CloneNotSupportedException e) { 1948 throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e); 1949 } 1950 } 1951 clearBuffer()1952 private void clearBuffer() { 1953 buffer.setLength(0); 1954 bufferPos=0; 1955 } 1956 nextNormalize()1957 private boolean nextNormalize() { 1958 clearBuffer(); 1959 currentIndex=nextIndex; 1960 text.setIndex(nextIndex); 1961 // Skip at least one character so we make progress. 
1962 int c=text.nextCodePoint(); 1963 if(c<0) { 1964 return false; 1965 } 1966 StringBuilder segment=new StringBuilder().appendCodePoint(c); 1967 while((c=text.nextCodePoint())>=0) { 1968 if(norm2.hasBoundaryBefore(c)) { 1969 text.moveCodePointIndex(-1); 1970 break; 1971 } 1972 segment.appendCodePoint(c); 1973 } 1974 nextIndex=text.getIndex(); 1975 norm2.normalize(segment, buffer); 1976 return buffer.length()!=0; 1977 } 1978 previousNormalize()1979 private boolean previousNormalize() { 1980 clearBuffer(); 1981 nextIndex=currentIndex; 1982 text.setIndex(currentIndex); 1983 StringBuilder segment=new StringBuilder(); 1984 int c; 1985 while((c=text.previousCodePoint())>=0) { 1986 if(c<=0xffff) { 1987 segment.insert(0, (char)c); 1988 } else { 1989 segment.insert(0, Character.toChars(c)); 1990 } 1991 if(norm2.hasBoundaryBefore(c)) { 1992 break; 1993 } 1994 } 1995 currentIndex=text.getIndex(); 1996 norm2.normalize(segment, buffer); 1997 bufferPos=buffer.length(); 1998 return buffer.length()!=0; 1999 } 2000 2001 /* compare canonically equivalent ------------------------------------------- */ 2002 2003 // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407 internalCompare(CharSequence s1, CharSequence s2, int options)2004 private static int internalCompare(CharSequence s1, CharSequence s2, int options) { 2005 int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT; 2006 options|= COMPARE_EQUIV; 2007 2008 /* 2009 * UAX #21 Case Mappings, as fixed for Unicode version 4 2010 * (see Jitterbug 2021), defines a canonical caseless match as 2011 * 2012 * A string X is a canonical caseless match 2013 * for a string Y if and only if 2014 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 2015 * 2016 * For better performance, we check for FCD (or let the caller tell us that 2017 * both strings are in FCD) for the inner normalization. 
2018 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that 2019 * case-folding preserves the FCD-ness of a string. 2020 * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold() 2021 * when there is a difference. 2022 * 2023 * Exception: When using the Turkic case-folding option, we do perform 2024 * full NFD first. This is because in the Turkic case precomposed characters 2025 * with 0049 capital I or 0069 small i fold differently whether they 2026 * are first decomposed or not, so an FCD check - a check only for 2027 * canonical order - is not sufficient. 2028 */ 2029 if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) { 2030 Normalizer2 n2; 2031 if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) { 2032 n2=NFD.getNormalizer2(normOptions); 2033 } else { 2034 n2=FCD.getNormalizer2(normOptions); 2035 } 2036 2037 // check if s1 and/or s2 fulfill the FCD conditions 2038 int spanQCYes1=n2.spanQuickCheckYes(s1); 2039 int spanQCYes2=n2.spanQuickCheckYes(s2); 2040 2041 /* 2042 * ICU 2.4 had a further optimization: 2043 * If both strings were not in FCD, then they were both NFD'ed, 2044 * and the COMPARE_EQUIV option was turned off. 2045 * It is not entirely clear that this is valid with the current 2046 * definition of the canonical caseless match. 2047 * Therefore, ICU 2.6 removes that optimization. 2048 */ 2049 2050 if(spanQCYes1<s1.length()) { 2051 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1); 2052 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length())); 2053 } 2054 if(spanQCYes2<s2.length()) { 2055 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2); 2056 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length())); 2057 } 2058 } 2059 2060 return cmpEquivFold(s1, s2, options); 2061 } 2062 2063 /* 2064 * Compare two strings for canonical equivalence. 
* Further options include case-insensitive comparison and
     * code point order (as opposed to code unit order).
     *
     * In this function, canonical equivalence is optional as well.
     * If canonical equivalence is tested, then both strings must fulfill
     * the FCD check.
     *
     * Semantically, this is equivalent to
     *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
     * where code point order, NFD and foldCase are all optional.
     *
     * String comparisons almost always yield results before processing both strings
     * completely.
     * They are generally more efficient working incrementally instead of
     * performing the sub-processing (strlen, normalization, case-folding)
     * on the entire strings first.
     *
     * It is also unnecessary to normalize identical characters.
     *
     * This function works in principle as follows:
     *
     * loop {
     *   get one code unit c1 from s1 (-1 if end of source)
     *   get one code unit c2 from s2 (-1 if end of source)
     *
     *   if(either string finished) {
     *     return result;
     *   }
     *   if(c1==c2) {
     *     continue;
     *   }
     *
     *   // c1!=c2
     *   try to decompose/case-fold c1/c2, and continue if one does;
     *
     *   // still c1!=c2 and neither decomposes/case-folds, return result
     *   return c1-c2;
     * }
     *
     * When a character decomposes, then the pointer for that source changes to
     * the decomposition, pushing the previous pointer onto a stack.
     * When the end of the decomposition is reached, then the code unit reader
     * pops the previous source from the stack.
     * (Same for case-folding.)
     *
     * This is complicated further by operating on variable-width UTF-16.
     * The top part of the loop works on code units, while lookups for decomposition
     * and case-folding need code points.
     * Code points are assembled after the equality/end-of-source part.
2114 * The source pointer is only advanced beyond all code units when the code point 2115 * actually decomposes/case-folds. 2116 * 2117 * If we were on a trail surrogate unit when assembling a code point, 2118 * and the code point decomposes/case-folds, then the decomposition/folding 2119 * result must be compared with the part of the other string that corresponds to 2120 * this string's lead surrogate. 2121 * Since we only assemble a code point when hitting a trail unit when the 2122 * preceding lead units were identical, we back up the other string by one unit 2123 * in such a case. 2124 * 2125 * The optional code point order comparison at the end works with 2126 * the same fix-up as the other code point order comparison functions. 2127 * See ustring.c and the comment near the end of this function. 2128 * 2129 * Assumption: A decomposition or case-folding result string never contains 2130 * a single surrogate. This is a safe assumption in the Unicode Standard. 2131 * Therefore, we do not need to check for surrogate pairs across 2132 * decomposition/case-folding boundaries. 2133 * 2134 * Further assumptions (see verifications tstnorm.cpp): 2135 * The API function checks for FCD first, while the core function 2136 * first case-folds and then decomposes. This requires that case-folding does not 2137 * un-FCD any strings. 2138 * 2139 * The API function may also NFD the input and turn off decomposition. 2140 * This requires that case-folding does not un-NFD strings either. 2141 * 2142 * TODO If any of the above two assumptions is violated, 2143 * then this entire code must be re-thought. 2144 * If this happens, then a simple solution is to case-fold both strings up front 2145 * and to turn off UNORM_INPUT_IS_FCD. 2146 * We already do this when not both strings are in FCD because makeFCD 2147 * would be a partial NFD before the case folding, which does not work. 
* Note that all of this is only a problem when case-folding _and_
     * canonical equivalence come together.
     * (Comments in unorm_compare() are more up to date than this TODO.)
     */

    /*
     * Stack element for previous-level source/decomposition pointers:
     * cs is the suspended source sequence, s the index at which to resume reading it.
     * A null cs marks an intermediate level that was skipped (see cmpEquivFold).
     */
    private static final class CmpEquivLevel {
        CharSequence cs;
        int s;
    };
    /* Creates a two-element level stack; cmpEquivFold never goes deeper than level 2. */
    private static final CmpEquivLevel[] createCmpEquivLevelStack() {
        return new CmpEquivLevel[] {
            new CmpEquivLevel(), new CmpEquivLevel()
        };
    }

    /**
     * Internal option for unorm_cmpEquivFold() for decomposing.
     * If not set, just do strcasecmp().
     */
    private static final int COMPARE_EQUIV=0x80000;

    /*
     * internal function; package visibility for use by UTF16.StringComparator
     *
     * Compares cs1 and cs2 code unit by code unit, replacing differing code points
     * on the fly by their case folding (if COMPARE_IGNORE_CASE is set in options)
     * and/or their decomposition (if COMPARE_EQUIV is set in options).
     *
     * Returns 0 if both sequences end at the same time, -1 if cs1 ends first,
     * 1 if cs2 ends first, and otherwise the difference c1-c2 of the first code
     * units that differ and cannot be folded/decomposed any further; with
     * COMPARE_CODE_POINT_ORDER that difference is fixed up for surrogates.
     */
    /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
        Normalizer2Impl nfcImpl;
        UCaseProps csp;

        /* current-level start/limit - s1/s2 as current */
        int s1, s2, limit1, limit2;

        /* decomposition and case folding variables */
        int length;

        /* stacks of previous-level start/current/limit; allocated lazily on first push */
        CmpEquivLevel[] stack1=null, stack2=null;

        /* buffers for algorithmic decompositions */
        String decomp1, decomp2;

        /* case folding buffers, only use current-level start/limit */
        StringBuilder fold1, fold2;

        /* track which is the current level per string */
        int level1, level2;

        /* current code units, and code points for lookups */
        int c1, c2, cp1, cp2;

        /* no argument error checking because this itself is not an API */

        /*
         * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
         * otherwise this function must behave exactly as uprv_strCompare()
         * not checking for that here makes testing this function easier
         */

        /* normalization/properties data loaded? */
        if((options&COMPARE_EQUIV)!=0) {
            nfcImpl=Norm2AllModes.getNFCInstance().impl;
        } else {
            nfcImpl=null;
        }
        if((options&COMPARE_IGNORE_CASE)!=0) {
            csp=UCaseProps.INSTANCE;
            fold1=new StringBuilder();
            fold2=new StringBuilder();
        } else {
            csp=null;
            fold1=fold2=null;
        }

        /* initialize */
        s1=0;
        limit1=cs1.length();
        s2=0;
        limit2=cs2.length();

        level1=level2=0;
        c1=c2=-1;

        /* comparison loop */
        for(;;) {
            /*
             * here a code unit value of -1 means "get another code unit"
             * below it will mean "this source is finished"
             */

            if(c1<0) {
                /* get next code unit from string 1, post-increment */
                for(;;) {
                    if(s1==limit1) {
                        if(level1==0) {
                            /* end of the original string 1: nothing left to pop */
                            c1=-1;
                            break;
                        }
                    } else {
                        c1=cs1.charAt(s1++);
                        break;
                    }

                    /* reached end of level buffer, pop one level (skipping empty null levels) */
                    do {
                        --level1;
                        cs1=stack1[level1].cs;
                    } while(cs1==null);
                    s1=stack1[level1].s;
                    limit1=cs1.length();
                }
            }

            if(c2<0) {
                /* get next code unit from string 2, post-increment */
                for(;;) {
                    if(s2==limit2) {
                        if(level2==0) {
                            /* end of the original string 2: nothing left to pop */
                            c2=-1;
                            break;
                        }
                    } else {
                        c2=cs2.charAt(s2++);
                        break;
                    }

                    /* reached end of level buffer, pop one level (skipping empty null levels) */
                    do {
                        --level2;
                        cs2=stack2[level2].cs;
                    } while(cs2==null);
                    s2=stack2[level2].s;
                    limit2=cs2.length();
                }
            }

            /*
             * compare c1 and c2
             * either variable c1, c2 is -1 only if the corresponding string is finished
             */
            if(c1==c2) {
                if(c1<0) {
                    return 0;   /* c1==c2==-1 indicating end of strings */
                }
                c1=c2=-1;       /* make us fetch new code units */
                continue;
            } else if(c1<0) {
                return -1;      /* string 1 ends before string 2 */
            } else if(c2<0) {
                return 1;       /* string 2 ends before string 1 */
            }
            /* c1!=c2 && c1>=0 && c2>=0 */

            /* get complete code points for c1, c2 for lookups if either is a surrogate */
            cp1=c1;
            if(UTF16.isSurrogate((char)c1)) {
                char c;

                if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
                    /* s1 was post-incremented, so it already indexes the unit after c1 */
                    if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
                        /* advance ++s1; only below if cp1 decomposes/case-folds */
                        cp1=Character.toCodePoint((char)c1, c);
                    }
                } else /* isTrail(c1) */ {
                    /* c1 sits at index s1-1, so a matching lead would be at s1-2 */
                    if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
                        cp1=Character.toCodePoint(c, (char)c1);
                    }
                }
            }

            cp2=c2;
            if(UTF16.isSurrogate((char)c2)) {
                char c;

                if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
                    /* s2 was post-incremented, so it already indexes the unit after c2 */
                    if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
                        /* advance ++s2; only below if cp2 decomposes/case-folds */
                        cp2=Character.toCodePoint((char)c2, c);
                    }
                } else /* isTrail(c2) */ {
                    /* c2 sits at index s2-1, so a matching lead would be at s2-2 */
                    if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
                        cp2=Character.toCodePoint(c, (char)c2);
                    }
                }
            }

            /*
             * go down one level for each string
             * continue with the main loop as soon as there is a real change
             */

            /* case folding is only attempted on the original string (level 0) */
            if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
                (length=csp.toFullFolding(cp1, fold1, options))>=0
            ) {
                /* cp1 case-folds to the code point "length" or to p[length] */
                if(UTF16.isSurrogate((char)c1)) {
                    if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
                        /* advance beyond source surrogate pair if it case-folds */
                        ++s1;
                    } else /* isTrail(c1) */ {
                        /*
                         * we got a supplementary code point when hitting its trail surrogate,
                         * therefore the lead surrogate must have been the same as in the other string;
                         * compare this decomposition with the lead surrogate in the other string
                         * remember that this simulates bulk text replacement:
                         * the decomposition would replace the entire code point
                         */
                        --s2;
                        c2=cs2.charAt(s2-1);
                    }
                }

                /* push current level pointers */
                if(stack1==null) {
                    stack1=createCmpEquivLevelStack();
                }
                stack1[0].cs=cs1;
                stack1[0].s=s1;
                ++level1;

                /* copy the folding result to fold1[] */
                /* Java: the buffer was probably not empty, remove the old contents */
                if(length<=UCaseProps.MAX_STRING_LENGTH) {
                    /*
                     * keep only the last "length" chars of fold1
                     * NOTE(review): presumably toFullFolding() appended the folded string - confirm
                     */
                    fold1.delete(0, fold1.length()-length);
                } else {
                    /* here "length" is itself the folded code point */
                    fold1.setLength(0);
                    fold1.appendCodePoint(length);
                }

                /* set next level pointers to case folding */
                cs1=fold1;
                s1=0;
                limit1=fold1.length();

                /* get ready to read from the case folding, continue with loop */
                c1=-1;
                continue;
            }

            /* case folding is only attempted on the original string (level 0) */
            if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
                (length=csp.toFullFolding(cp2, fold2, options))>=0
            ) {
                /* cp2 case-folds to the code point "length" or to p[length] */
                if(UTF16.isSurrogate((char)c2)) {
                    if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
                        /* advance beyond source surrogate pair if it case-folds */
                        ++s2;
                    } else /* isTrail(c2) */ {
                        /*
                         * we got a supplementary code point when hitting its trail surrogate,
                         * therefore the lead surrogate must have been the same as in the other string;
                         * compare this decomposition with the lead surrogate in the other string
                         * remember that this simulates bulk text replacement:
                         * the decomposition would replace the entire code point
                         */
                        --s1;
                        c1=cs1.charAt(s1-1);
                    }
                }

                /* push current level pointers */
                if(stack2==null) {
                    stack2=createCmpEquivLevelStack();
                }
                stack2[0].cs=cs2;
                stack2[0].s=s2;
                ++level2;

                /* copy the folding result to fold2[] */
                /* Java: the buffer was probably not empty, remove the old contents */
                if(length<=UCaseProps.MAX_STRING_LENGTH) {
                    /*
                     * keep only the last "length" chars of fold2
                     * NOTE(review): presumably toFullFolding() appended the folded string - confirm
                     */
                    fold2.delete(0, fold2.length()-length);
                } else {
                    /* here "length" is itself the folded code point */
                    fold2.setLength(0);
                    fold2.appendCodePoint(length);
                }

                /* set next level pointers to case folding */
                cs2=fold2;
                s2=0;
                limit2=fold2.length();

                /* get ready to read from the case folding, continue with loop */
                c2=-1;
                continue;
            }

            /* decomposition is attempted on the original string or on a case folding (level 0 or 1) */
            if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
                (decomp1=nfcImpl.getDecomposition(cp1))!=null
            ) {
                /* cp1 decomposes into p[length] */
                if(UTF16.isSurrogate((char)c1)) {
                    if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
                        /* advance beyond source surrogate pair if it decomposes */
                        ++s1;
                    } else /* isTrail(c1) */ {
                        /*
                         * we got a supplementary code point when hitting its trail surrogate,
                         * therefore the lead surrogate must have been the same as in the other string;
                         * compare this decomposition with the lead surrogate in the other string
                         * remember that this simulates bulk text replacement:
                         * the decomposition would replace the entire code point
                         */
                        --s2;
                        c2=cs2.charAt(s2-1);
                    }
                }

                /* push current level pointers */
                if(stack1==null) {
                    stack1=createCmpEquivLevelStack();
                }
                stack1[level1].cs=cs1;
                stack1[level1].s=s1;
                ++level1;

                /* set empty intermediate level if skipped (so the decomposition is always level 2) */
                if(level1<2) {
                    stack1[level1++].cs=null;
                }

                /* set next level pointers to decomposition */
                cs1=decomp1;
                s1=0;
                limit1=decomp1.length();

                /* get ready to read from decomposition, continue with loop */
                c1=-1;
                continue;
            }

            /* decomposition is attempted on the original string or on a case folding (level 0 or 1) */
            if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
                (decomp2=nfcImpl.getDecomposition(cp2))!=null
            ) {
                /* cp2 decomposes into p[length] */
                if(UTF16.isSurrogate((char)c2)) {
                    if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
                        /* advance beyond source surrogate pair if it decomposes */
                        ++s2;
                    } else /* isTrail(c2) */ {
                        /*
                         * we got a supplementary code point when hitting its trail surrogate,
                         * therefore the lead surrogate must have been the same as in the other string;
                         * compare this decomposition with the lead surrogate in the other string
                         * remember that this simulates bulk text replacement:
                         * the decomposition would replace the entire code point
                         */
                        --s1;
                        c1=cs1.charAt(s1-1);
                    }
                }

                /* push current level pointers */
                if(stack2==null) {
                    stack2=createCmpEquivLevelStack();
                }
                stack2[level2].cs=cs2;
                stack2[level2].s=s2;
                ++level2;

                /* set empty intermediate level if skipped (so the decomposition is always level 2) */
                if(level2<2) {
                    stack2[level2++].cs=null;
                }

                /* set next level pointers to decomposition */
                cs2=decomp2;
                s2=0;
                limit2=decomp2.length();

                /* get ready to read from decomposition, continue with loop */
                c2=-1;
                continue;
            }

            /*
             * no decomposition/case folding, max level for both sides:
             * return difference result
             *
             * code point order comparison must not just return cp1-cp2
             * because when single surrogates are present then the surrogate pairs
             * that formed cp1 and cp2 may be from different string indexes
             *
             * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
             * c1=d800 cp1=10001 c2=dc00 cp2=10000
             * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
             *
             * therefore, use same fix-up as in ustring.c/uprv_strCompare()
             * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
             * so we have slightly different pointer/start/limit comparisons here
             */

            if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
                /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
                if(
                    (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
                    (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
                ) {
                    /* part of a surrogate pair, leave >=d800 */
                } else {
                    /* BMP code point - may be surrogate code point - make <d800 */
                    c1-=0x2800;
                }

                if(
                    (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
                    (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
                ) {
                    /* part of a surrogate pair, leave >=d800 */
                } else {
                    /* BMP code point - may be surrogate code point - make <d800 */
                    c2-=0x2800;
                }
            }

            return c1-c2;
        }
    }

    /**
     * An Appendable that writes into a char array with a capacity that may be
     * less than array.length.
     * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
     * <p>
     * An overflow is only reported at the end, for the old Normalizer API functions that write
     * to char arrays.
*/
    private static final class CharsAppendable implements Appendable {
        /* Destination array, and the window [start, limit) that may be written. */
        private final char[] chars;
        private final int start, limit;
        /* Next write position; keeps advancing past limit so overflow can be detected. */
        private int offset;

        public CharsAppendable(char[] dest, int destStart, int destLimit) {
            chars = dest;
            start = destStart;
            offset = destStart;
            limit = destLimit;
        }

        /*
         * Returns the number of chars appended so far, or throws
         * IndexOutOfBoundsException (with that number as the message)
         * if more was appended than fits before the limit.
         */
        public int length() {
            final int appended = offset - start;
            if (offset > limit) {
                throw new IndexOutOfBoundsException(Integer.toString(appended));
            }
            return appended;
        }

        @Override
        public Appendable append(char c) {
            // Count the char even when it no longer fits; only store it if it does.
            final boolean fits = offset < limit;
            if (fits) {
                chars[offset] = c;
            }
            ++offset;
            return this;
        }

        @Override
        public Appendable append(CharSequence s) {
            final int end = s.length();
            return append(s, 0, end);
        }

        @Override
        public Appendable append(CharSequence s, int sStart, int sLimit) {
            final int count = sLimit - sStart;
            if (count > (limit - offset)) {
                // Does not fit: copy nothing, just record how far we would have gone.
                offset += count;
                return this;
            }
            // TODO: Is there a better way to copy the characters?
            for (int i = sStart; i < sLimit; i++) {
                chars[offset++] = s.charAt(i);
            }
            return this;
        }
    }
}