1 /* 2 ******************************************************************************* 3 * Copyright (C) 2000-2016, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.text; 8 import java.nio.CharBuffer; 9 import java.text.CharacterIterator; 10 11 import com.ibm.icu.impl.Norm2AllModes; 12 import com.ibm.icu.impl.Normalizer2Impl; 13 import com.ibm.icu.impl.UCaseProps; 14 import com.ibm.icu.lang.UCharacter; 15 import com.ibm.icu.util.ICUCloneNotSupportedException; 16 17 /** 18 * Old Unicode normalization API. 19 * 20 * <p>This API has been replaced by the {@link Normalizer2} class and is only available 21 * for backward compatibility. This class simply delegates to the Normalizer2 class. 22 * There are two exceptions: The new API does not provide a replacement for 23 * <code>QuickCheckResult</code> and <code>compare()</code>. 24 * 25 * <p><code>normalize</code> transforms Unicode text into an equivalent composed or 26 * decomposed form, allowing for easier sorting and searching of text. 27 * <code>normalize</code> supports the standard normalization forms described in 28 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> 29 * Unicode Standard Annex #15 — Unicode Normalization Forms</a>. 30 * 31 * <p>Characters with accents or other adornments can be encoded in 32 * several different ways in Unicode. For example, take the character A-acute. 33 * In Unicode, this can be encoded as a single character (the 34 * "composed" form): 35 * 36 * <pre> 37 * 00C1 LATIN CAPITAL LETTER A WITH ACUTE 38 * </pre> 39 * 40 * or as two separate characters (the "decomposed" form): 41 * 42 * <pre> 43 * 0041 LATIN CAPITAL LETTER A 44 * 0301 COMBINING ACUTE ACCENT 45 * </pre> 46 * 47 * <p>To a user of your program, however, both of these sequences should be 48 * treated as the same "user-level" character "A with acute accent". 
When you 49 * are searching or comparing text, you must ensure that these two sequences are 50 * treated equivalently. In addition, you must handle characters with more than 51 * one accent. Sometimes the order of a character's combining accents is 52 * significant, while in other cases accent sequences in different orders are 53 * really equivalent. 54 * 55 * <p>Similarly, the string "ffi" can be encoded as three separate letters: 56 * 57 * <pre> 58 * 0066 LATIN SMALL LETTER F 59 * 0066 LATIN SMALL LETTER F 60 * 0069 LATIN SMALL LETTER I 61 * </pre> 62 * 63 * or as the single character 64 * 65 * <pre> 66 * FB03 LATIN SMALL LIGATURE FFI 67 * </pre> 68 * 69 * <p>The ffi ligature is not a distinct semantic character, and strictly speaking 70 * it shouldn't be in Unicode at all, but it was included for compatibility 71 * with existing character sets that already provided it. The Unicode standard 72 * identifies such characters by giving them "compatibility" decompositions 73 * into the corresponding semantic characters. When sorting and searching, you 74 * will often want to use these mappings. 75 * 76 * <p><code>normalize</code> helps solve these problems by transforming text into 77 * the canonical composed and decomposed forms as shown in the first example 78 * above. In addition, you can have it perform compatibility decompositions so 79 * that you can treat compatibility characters the same as their equivalents. 80 * Finally, <code>normalize</code> rearranges accents into the proper canonical 81 * order, so that you do not have to worry about accent rearrangement on your 82 * own. 83 * 84 * <p>Form FCD, "Fast C or D", is also designed for collation. 85 * It allows to work on strings that are not necessarily normalized 86 * with an algorithm (like in collation) that works under "canonical closure", 87 * i.e., it treats precomposed characters and their decomposed equivalents the 88 * same. 
89 * 90 * <p>It is not a normalization form because it does not provide for uniqueness of 91 * representation. Multiple strings may be canonically equivalent (their NFDs 92 * are identical) and may all conform to FCD without being identical themselves. 93 * 94 * <p>The form is defined such that the "raw decomposition", the recursive 95 * canonical decomposition of each character, results in a string that is 96 * canonically ordered. This means that precomposed characters are allowed for 97 * as long as their decompositions do not need canonical reordering. 98 * 99 * <p>Its advantage for a process like collation is that all NFD and most NFC texts 100 * - and many unnormalized texts - already conform to FCD and do not need to be 101 * normalized (NFD) for such a process. The FCD quick check will return YES for 102 * most strings in practice. 103 * 104 * <p>normalize(FCD) may be implemented with NFD. 105 * 106 * <p>For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): 107 * http://www.unicode.org/notes/tn5/#FCD 108 * 109 * <p>ICU collation performs either NFD or FCD normalization automatically if 110 * normalization is turned on for the collator object. Beyond collation and 111 * string search, normalized strings may be useful for string equivalence 112 * comparisons, transliteration/transcription, unique representations, etc. 113 * 114 * <p>The W3C generally recommends to exchange texts in NFC. 115 * Note also that most legacy character encodings use only precomposed forms and 116 * often do not encode any combining marks by themselves. For conversion to such 117 * character encodings the Unicode text needs to be normalized to NFC. 118 * For more usage examples, see the Unicode Standard Annex. 119 * 120 * <p>Note: The Normalizer class also provides API for iterative normalization. 
121 * While the setIndex() and getIndex() refer to indices in the 122 * underlying Unicode input text, the next() and previous() methods 123 * iterate through characters in the normalized output. 124 * This means that there is not necessarily a one-to-one correspondence 125 * between characters returned by next() and previous() and the indices 126 * passed to and returned from setIndex() and getIndex(). 127 * It is for this reason that Normalizer does not implement the CharacterIterator interface. 128 * 129 * @stable ICU 2.8 130 */ 131 public final class Normalizer implements Cloneable { 132 // The input text and our position in it 133 private UCharacterIterator text; 134 private Normalizer2 norm2; 135 private Mode mode; 136 private int options; 137 138 // The normalization buffer is the result of normalization 139 // of the source in [currentIndex..nextIndex[ . 140 private int currentIndex; 141 private int nextIndex; 142 143 // A buffer for holding intermediate results 144 private StringBuilder buffer; 145 private int bufferPos; 146 147 // Helper classes to defer loading of normalization data. 
148 private static final class ModeImpl { ModeImpl(Normalizer2 n2)149 private ModeImpl(Normalizer2 n2) { 150 normalizer2 = n2; 151 } 152 private final Normalizer2 normalizer2; 153 } 154 private static final class NFDModeImpl { 155 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); 156 } 157 private static final class NFKDModeImpl { 158 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); 159 } 160 private static final class NFCModeImpl { 161 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); 162 } 163 private static final class NFKCModeImpl { 164 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); 165 } 166 private static final class FCDModeImpl { 167 private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2()); 168 } 169 170 private static final class Unicode32 { 171 private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); 172 } 173 private static final class NFD32ModeImpl { 174 private static final ModeImpl INSTANCE = 175 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), 176 Unicode32.INSTANCE)); 177 } 178 private static final class NFKD32ModeImpl { 179 private static final ModeImpl INSTANCE = 180 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), 181 Unicode32.INSTANCE)); 182 } 183 private static final class NFC32ModeImpl { 184 private static final ModeImpl INSTANCE = 185 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), 186 Unicode32.INSTANCE)); 187 } 188 private static final class NFKC32ModeImpl { 189 private static final ModeImpl INSTANCE = 190 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), 191 Unicode32.INSTANCE)); 192 } 193 private static final class FCD32ModeImpl { 194 private static final ModeImpl INSTANCE = 195 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(), 196 
Unicode32.INSTANCE)); 197 } 198 199 /** 200 * Options bit set value to select Unicode 3.2 normalization 201 * (except NormalizationCorrections). 202 * At most one Unicode version can be selected at a time. 203 * 204 * @deprecated ICU 56 Use {@link FilteredNormalizer2} instead. 205 */ 206 @Deprecated 207 public static final int UNICODE_3_2=0x20; 208 209 /** 210 * Constant indicating that the end of the iteration has been reached. 211 * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. 212 * 213 * @deprecated ICU 56 214 */ 215 @Deprecated 216 public static final int DONE = UCharacterIterator.DONE; 217 218 /** 219 * Constants for normalization modes. 220 * <p> 221 * The Mode class is not intended for public subclassing. 222 * Only the Mode constants provided by the Normalizer class should be used, 223 * and any fields or methods should not be called or overridden by users. 224 * 225 * @deprecated ICU 56 Use {@link Normalizer2} instead. 226 */ 227 @Deprecated 228 public static abstract class Mode { 229 /** 230 * Sole constructor 231 * @internal 232 * @deprecated This API is ICU internal only. 233 */ 234 @Deprecated Mode()235 protected Mode() { 236 } 237 238 /** 239 * @internal 240 * @deprecated This API is ICU internal only. 241 */ 242 @Deprecated getNormalizer2(int options)243 protected abstract Normalizer2 getNormalizer2(int options); 244 } 245 246 private static final class NONEMode extends Mode { getNormalizer2(int options)247 protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } 248 } 249 private static final class NFDMode extends Mode { getNormalizer2(int options)250 protected Normalizer2 getNormalizer2(int options) { 251 return (options&UNICODE_3_2) != 0 ? 
252 NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2; 253 } 254 } 255 private static final class NFKDMode extends Mode { getNormalizer2(int options)256 protected Normalizer2 getNormalizer2(int options) { 257 return (options&UNICODE_3_2) != 0 ? 258 NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2; 259 } 260 } 261 private static final class NFCMode extends Mode { getNormalizer2(int options)262 protected Normalizer2 getNormalizer2(int options) { 263 return (options&UNICODE_3_2) != 0 ? 264 NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2; 265 } 266 } 267 private static final class NFKCMode extends Mode { getNormalizer2(int options)268 protected Normalizer2 getNormalizer2(int options) { 269 return (options&UNICODE_3_2) != 0 ? 270 NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2; 271 } 272 } 273 private static final class FCDMode extends Mode { getNormalizer2(int options)274 protected Normalizer2 getNormalizer2(int options) { 275 return (options&UNICODE_3_2) != 0 ? 276 FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2; 277 } 278 } 279 280 /** 281 * No decomposition/composition. 282 * 283 * @deprecated ICU 56 Use {@link Normalizer2} instead. 284 */ 285 @Deprecated 286 public static final Mode NONE = new NONEMode(); 287 288 /** 289 * Canonical decomposition. 290 * 291 * @deprecated ICU 56 Use {@link Normalizer2} instead. 292 */ 293 @Deprecated 294 public static final Mode NFD = new NFDMode(); 295 296 /** 297 * Compatibility decomposition. 298 * 299 * @deprecated ICU 56 Use {@link Normalizer2} instead. 300 */ 301 @Deprecated 302 public static final Mode NFKD = new NFKDMode(); 303 304 /** 305 * Canonical decomposition followed by canonical composition. 306 * 307 * @deprecated ICU 56 Use {@link Normalizer2} instead. 308 */ 309 @Deprecated 310 public static final Mode NFC = new NFCMode(); 311 312 /** 313 * Default normalization. 
314 * 315 * @deprecated ICU 56 Use {@link Normalizer2} instead. 316 */ 317 @Deprecated 318 public static final Mode DEFAULT = NFC; 319 320 /** 321 * Compatibility decomposition followed by canonical composition. 322 * 323 * @deprecated ICU 56 Use {@link Normalizer2} instead. 324 */ 325 @Deprecated 326 public static final Mode NFKC =new NFKCMode(); 327 328 /** 329 * "Fast C or D" form. 330 * 331 * @deprecated ICU 56 Use {@link Normalizer2} instead. 332 */ 333 @Deprecated 334 public static final Mode FCD = new FCDMode(); 335 336 /** 337 * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors} 338 * and the static {@link #normalize normalize} method. This value tells 339 * the <tt>Normalizer</tt> to do nothing but return unprocessed characters 340 * from the underlying String or CharacterIterator. If you have code which 341 * requires raw text at some times and normalized text at others, you can 342 * use <tt>NO_OP</tt> for the cases where you want raw text, rather 343 * than having a separate code path that bypasses <tt>Normalizer</tt> 344 * altogether. 345 * <p> 346 * @see #setMode 347 * @deprecated ICU 2.8. Use Nomalizer.NONE 348 * @see #NONE 349 */ 350 @Deprecated 351 public static final Mode NO_OP = NONE; 352 353 /** 354 * Canonical decomposition followed by canonical composition. Used with the 355 * {@link com.ibm.icu.text.Normalizer constructors} and the static 356 * {@link #normalize normalize} method to determine the operation to be 357 * performed. 358 * <p> 359 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 360 * off, this operation produces output that is in 361 * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 362 * Form</a> 363 * <b>C</b>. 364 * <p> 365 * @see #setMode 366 * @deprecated ICU 2.8. Use Normalier.NFC 367 * @see #NFC 368 */ 369 @Deprecated 370 public static final Mode COMPOSE = NFC; 371 372 /** 373 * Compatibility decomposition followed by canonical composition. 
374 * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static 375 * {@link #normalize normalize} method to determine the operation to be 376 * performed. 377 * <p> 378 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 379 * off, this operation produces output that is in 380 * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 381 * Form</a> 382 * <b>KC</b>. 383 * <p> 384 * @see #setMode 385 * @deprecated ICU 2.8. Use Normalizer.NFKC 386 * @see #NFKC 387 */ 388 @Deprecated 389 public static final Mode COMPOSE_COMPAT = NFKC; 390 391 /** 392 * Canonical decomposition. This value is passed to the 393 * {@link com.ibm.icu.text.Normalizer constructors} and the static 394 * {@link #normalize normalize} 395 * method to determine the operation to be performed. 396 * <p> 397 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 398 * off, this operation produces output that is in 399 * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 400 * Form</a> 401 * <b>D</b>. 402 * <p> 403 * @see #setMode 404 * @deprecated ICU 2.8. Use Normalizer.NFD 405 * @see #NFD 406 */ 407 @Deprecated 408 public static final Mode DECOMP = NFD; 409 410 /** 411 * Compatibility decomposition. This value is passed to the 412 * {@link com.ibm.icu.text.Normalizer constructors} and the static 413 * {@link #normalize normalize} 414 * method to determine the operation to be performed. 415 * <p> 416 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 417 * off, this operation produces output that is in 418 * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 419 * Form</a> 420 * <b>KD</b>. 421 * <p> 422 * @see #setMode 423 * @deprecated ICU 2.8. Use Normalizer.NFKD 424 * @see #NFKD 425 */ 426 @Deprecated 427 public static final Mode DECOMP_COMPAT = NFKD; 428 429 /** 430 * Option to disable Hangul/Jamo composition and decomposition. 
431 * This option applies to Korean text, 432 * which can be represented either in the Jamo alphabet or in Hangul 433 * characters, which are really just two or three Jamo combined 434 * into one visual glyph. Since Jamo takes up more storage space than 435 * Hangul, applications that process only Hangul text may wish to turn 436 * this option on when decomposing text. 437 * <p> 438 * The Unicode standard treates Hangul to Jamo conversion as a 439 * canonical decomposition, so this option must be turned <b>off</b> if you 440 * wish to transform strings into one of the standard 441 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> 442 * Unicode Normalization Forms</a>. 443 * <p> 444 * @see #setOption 445 * @deprecated ICU 2.8. This option is no longer supported. 446 */ 447 @Deprecated 448 public static final int IGNORE_HANGUL = 0x0001; 449 450 /** 451 * Result values for quickCheck(). 452 * For details see Unicode Technical Report 15. 453 * @stable ICU 2.8 454 */ 455 public static final class QuickCheckResult{ 456 //private int resultValue; QuickCheckResult(int value)457 private QuickCheckResult(int value) { 458 //resultValue=value; 459 } 460 } 461 /** 462 * Indicates that string is not in the normalized format 463 * @stable ICU 2.8 464 */ 465 public static final QuickCheckResult NO = new QuickCheckResult(0); 466 467 /** 468 * Indicates that string is in the normalized format 469 * @stable ICU 2.8 470 */ 471 public static final QuickCheckResult YES = new QuickCheckResult(1); 472 473 /** 474 * Indicates it cannot be determined if string is in the normalized 475 * format without further thorough checks. 
476 * @stable ICU 2.8 477 */ 478 public static final QuickCheckResult MAYBE = new QuickCheckResult(2); 479 480 /** 481 * Option bit for compare: 482 * Case sensitively compare the strings 483 * @stable ICU 2.8 484 */ 485 public static final int FOLD_CASE_DEFAULT = UCharacter.FOLD_CASE_DEFAULT; 486 487 /** 488 * Option bit for compare: 489 * Both input strings are assumed to fulfill FCD conditions. 490 * @stable ICU 2.8 491 */ 492 public static final int INPUT_IS_FCD = 0x20000; 493 494 /** 495 * Option bit for compare: 496 * Perform case-insensitive comparison. 497 * @stable ICU 2.8 498 */ 499 public static final int COMPARE_IGNORE_CASE = 0x10000; 500 501 /** 502 * Option bit for compare: 503 * Compare strings in code point order instead of code unit order. 504 * @stable ICU 2.8 505 */ 506 public static final int COMPARE_CODE_POINT_ORDER = 0x8000; 507 508 /** 509 * Option value for case folding: 510 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I 511 * and dotless i appropriately for Turkic languages (tr, az). 512 * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I 513 * @stable ICU 2.8 514 */ 515 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I; 516 517 /** 518 * Lowest-order bit number of compare() options bits corresponding to 519 * normalization options bits. 520 * 521 * The options parameter for compare() uses most bits for 522 * itself and for various comparison and folding flags. 523 * The most significant bits, however, are shifted down and passed on 524 * to the normalization implementation. 525 * (That is, from compare(..., options, ...), 526 * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the 527 * internal normalization functions.) 528 * 529 * @see #compare 530 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
531 */ 532 @Deprecated 533 public static final int COMPARE_NORM_OPTIONS_SHIFT = 20; 534 535 //------------------------------------------------------------------------- 536 // Iterator constructors 537 //------------------------------------------------------------------------- 538 539 /** 540 * Creates a new <tt>Normalizer</tt> object for iterating over the 541 * normalized form of a given string. 542 * <p> 543 * The <tt>options</tt> parameter specifies which optional 544 * <tt>Normalizer</tt> features are to be enabled for this object. 545 * <p> 546 * @param str The string to be normalized. The normalization 547 * will start at the beginning of the string. 548 * 549 * @param mode The normalization mode. 550 * 551 * @param opt Any optional features to be enabled. 552 * Currently the only available option is {@link #UNICODE_3_2}. 553 * If you want the default behavior corresponding to one of the 554 * standard Unicode Normalization Forms, use 0 for this argument. 555 * @deprecated ICU 56 Use {@link Normalizer2} instead. 556 */ 557 @Deprecated Normalizer(String str, Mode mode, int opt)558 public Normalizer(String str, Mode mode, int opt) { 559 this.text = UCharacterIterator.getInstance(str); 560 this.mode = mode; 561 this.options=opt; 562 norm2 = mode.getNormalizer2(opt); 563 buffer = new StringBuilder(); 564 } 565 566 /** 567 * Creates a new <tt>Normalizer</tt> object for iterating over the 568 * normalized form of the given text. 569 * <p> 570 * @param iter The input text to be normalized. The normalization 571 * will start at the beginning of the string. 572 * 573 * @param mode The normalization mode. 574 * 575 * @param opt Any optional features to be enabled. 576 * Currently the only available option is {@link #UNICODE_3_2}. 577 * If you want the default behavior corresponding to one of the 578 * standard Unicode Normalization Forms, use 0 for this argument. 579 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
580 */ 581 @Deprecated Normalizer(CharacterIterator iter, Mode mode, int opt)582 public Normalizer(CharacterIterator iter, Mode mode, int opt) { 583 this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); 584 this.mode = mode; 585 this.options = opt; 586 norm2 = mode.getNormalizer2(opt); 587 buffer = new StringBuilder(); 588 } 589 590 /** 591 * Creates a new <tt>Normalizer</tt> object for iterating over the 592 * normalized form of the given text. 593 * <p> 594 * @param iter The input text to be normalized. The normalization 595 * will start at the beginning of the string. 596 * 597 * @param mode The normalization mode. 598 * @param options The normalization options, ORed together (0 for no options). 599 * @deprecated ICU 56 Use {@link Normalizer2} instead. 600 */ 601 @Deprecated Normalizer(UCharacterIterator iter, Mode mode, int options)602 public Normalizer(UCharacterIterator iter, Mode mode, int options) { 603 try { 604 this.text = (UCharacterIterator)iter.clone(); 605 this.mode = mode; 606 this.options = options; 607 norm2 = mode.getNormalizer2(options); 608 buffer = new StringBuilder(); 609 } catch (CloneNotSupportedException e) { 610 throw new ICUCloneNotSupportedException(e); 611 } 612 } 613 614 /** 615 * Clones this <tt>Normalizer</tt> object. All properties of this 616 * object are duplicated in the new object, including the cloning of any 617 * {@link CharacterIterator} that was passed in to the constructor 618 * or to {@link #setText(CharacterIterator) setText}. 619 * However, the text storage underlying 620 * the <tt>CharacterIterator</tt> is not duplicated unless the 621 * iterator's <tt>clone</tt> method does so. 622 * 623 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
624 */ 625 @Deprecated 626 @Override clone()627 public Object clone() { 628 try { 629 Normalizer copy = (Normalizer) super.clone(); 630 copy.text = (UCharacterIterator) text.clone(); 631 copy.mode = mode; 632 copy.options = options; 633 copy.norm2 = norm2; 634 copy.buffer = new StringBuilder(buffer); 635 copy.bufferPos = bufferPos; 636 copy.currentIndex = currentIndex; 637 copy.nextIndex = nextIndex; 638 return copy; 639 } 640 catch (CloneNotSupportedException e) { 641 throw new ICUCloneNotSupportedException(e); 642 } 643 } 644 645 //-------------------------------------------------------------------------- 646 // Static Utility methods 647 //-------------------------------------------------------------------------- 648 getComposeNormalizer2(boolean compat, int options)649 private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) { 650 return (compat ? NFKC : NFC).getNormalizer2(options); 651 } getDecomposeNormalizer2(boolean compat, int options)652 private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) { 653 return (compat ? NFKD : NFD).getNormalizer2(options); 654 } 655 656 /** 657 * Compose a string. 658 * The string will be composed to according to the specified mode. 659 * @param str The string to compose. 660 * @param compat If true the string will be composed according to 661 * NFKC rules and if false will be composed according to 662 * NFC rules. 663 * @return String The composed string 664 * @deprecated ICU 56 Use {@link Normalizer2} instead. 665 */ 666 @Deprecated compose(String str, boolean compat)667 public static String compose(String str, boolean compat) { 668 return compose(str,compat,0); 669 } 670 671 /** 672 * Compose a string. 673 * The string will be composed to according to the specified mode. 674 * @param str The string to compose. 675 * @param compat If true the string will be composed according to 676 * NFKC rules and if false will be composed according to 677 * NFC rules. 
678 * @param options The only recognized option is UNICODE_3_2 679 * @return String The composed string 680 * @deprecated ICU 56 Use {@link Normalizer2} instead. 681 */ 682 @Deprecated compose(String str, boolean compat, int options)683 public static String compose(String str, boolean compat, int options) { 684 return getComposeNormalizer2(compat, options).normalize(str); 685 } 686 687 /** 688 * Compose a string. 689 * The string will be composed to according to the specified mode. 690 * @param source The char array to compose. 691 * @param target A char buffer to receive the normalized text. 692 * @param compat If true the char array will be composed according to 693 * NFKC rules and if false will be composed according to 694 * NFC rules. 695 * @param options The normalization options, ORed together (0 for no options). 696 * @return int The total buffer size needed;if greater than length of 697 * result, the output was truncated. 698 * @exception IndexOutOfBoundsException if target.length is less than the 699 * required length 700 * @deprecated ICU 56 Use {@link Normalizer2} instead. 701 */ 702 @Deprecated compose(char[] source,char[] target, boolean compat, int options)703 public static int compose(char[] source,char[] target, boolean compat, int options) { 704 return compose(source, 0, source.length, target, 0, target.length, compat, options); 705 } 706 707 /** 708 * Compose a string. 709 * The string will be composed to according to the specified mode. 710 * @param src The char array to compose. 711 * @param srcStart Start index of the source 712 * @param srcLimit Limit index of the source 713 * @param dest The char buffer to fill in 714 * @param destStart Start index of the destination buffer 715 * @param destLimit End index of the destination buffer 716 * @param compat If true the char array will be composed according to 717 * NFKC rules and if false will be composed according to 718 * NFC rules. 
719 * @param options The normalization options, ORed together (0 for no options). 720 * @return int The total buffer size needed;if greater than length of 721 * result, the output was truncated. 722 * @exception IndexOutOfBoundsException if target.length is less than the 723 * required length 724 * @deprecated ICU 56 Use {@link Normalizer2} instead. 725 */ 726 @Deprecated compose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)727 public static int compose(char[] src,int srcStart, int srcLimit, 728 char[] dest,int destStart, int destLimit, 729 boolean compat, int options) { 730 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 731 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 732 getComposeNormalizer2(compat, options).normalize(srcBuffer, app); 733 return app.length(); 734 } 735 736 /** 737 * Decompose a string. 738 * The string will be decomposed to according to the specified mode. 739 * @param str The string to decompose. 740 * @param compat If true the string will be decomposed according to NFKD 741 * rules and if false will be decomposed according to NFD 742 * rules. 743 * @return String The decomposed string 744 * @deprecated ICU 56 Use {@link Normalizer2} instead. 745 */ 746 @Deprecated decompose(String str, boolean compat)747 public static String decompose(String str, boolean compat) { 748 return decompose(str,compat,0); 749 } 750 751 /** 752 * Decompose a string. 753 * The string will be decomposed to according to the specified mode. 754 * @param str The string to decompose. 755 * @param compat If true the string will be decomposed according to NFKD 756 * rules and if false will be decomposed according to NFD 757 * rules. 758 * @param options The normalization options, ORed together (0 for no options). 759 * @return String The decomposed string 760 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
761 */ 762 @Deprecated decompose(String str, boolean compat, int options)763 public static String decompose(String str, boolean compat, int options) { 764 return getDecomposeNormalizer2(compat, options).normalize(str); 765 } 766 767 /** 768 * Decompose a string. 769 * The string will be decomposed to according to the specified mode. 770 * @param source The char array to decompose. 771 * @param target A char buffer to receive the normalized text. 772 * @param compat If true the char array will be decomposed according to NFKD 773 * rules and if false will be decomposed according to 774 * NFD rules. 775 * @return int The total buffer size needed;if greater than length of 776 * result,the output was truncated. 777 * @param options The normalization options, ORed together (0 for no options). 778 * @exception IndexOutOfBoundsException if the target capacity is less than 779 * the required length 780 * @deprecated ICU 56 Use {@link Normalizer2} instead. 781 */ 782 @Deprecated decompose(char[] source,char[] target, boolean compat, int options)783 public static int decompose(char[] source,char[] target, boolean compat, int options) { 784 return decompose(source, 0, source.length, target, 0, target.length, compat, options); 785 } 786 787 /** 788 * Decompose a string. 789 * The string will be decomposed to according to the specified mode. 790 * @param src The char array to compose. 791 * @param srcStart Start index of the source 792 * @param srcLimit Limit index of the source 793 * @param dest The char buffer to fill in 794 * @param destStart Start index of the destination buffer 795 * @param destLimit End index of the destination buffer 796 * @param compat If true the char array will be decomposed according to NFKD 797 * rules and if false will be decomposed according to 798 * NFD rules. 799 * @param options The normalization options, ORed together (0 for no options). 
800 * @return int The total buffer size needed;if greater than length of 801 * result,the output was truncated. 802 * @exception IndexOutOfBoundsException if the target capacity is less than 803 * the required length 804 * @deprecated ICU 56 Use {@link Normalizer2} instead. 805 */ 806 @Deprecated decompose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)807 public static int decompose(char[] src,int srcStart, int srcLimit, 808 char[] dest,int destStart, int destLimit, 809 boolean compat, int options) { 810 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 811 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 812 getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app); 813 return app.length(); 814 } 815 816 /** 817 * Normalizes a <tt>String</tt> using the given normalization operation. 818 * <p> 819 * The <tt>options</tt> parameter specifies which optional 820 * <tt>Normalizer</tt> features are to be enabled for this operation. 821 * Currently the only available option is {@link #UNICODE_3_2}. 822 * If you want the default behavior corresponding to one of the standard 823 * Unicode Normalization Forms, use 0 for this argument. 824 * <p> 825 * @param str the input string to be normalized. 826 * @param mode the normalization mode 827 * @param options the optional features to be enabled. 828 * @return String the normalized string 829 * @deprecated ICU 56 Use {@link Normalizer2} instead. 830 */ 831 @Deprecated normalize(String str, Mode mode, int options)832 public static String normalize(String str, Mode mode, int options) { 833 return mode.getNormalizer2(options).normalize(str); 834 } 835 836 /** 837 * Normalize a string. 838 * The string will be normalized according to the specified normalization 839 * mode and options. 840 * @param src The string to normalize. 
841 * @param mode The normalization mode; one of Normalizer.NONE, 842 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 843 * Normalizer.NFKD, Normalizer.DEFAULT 844 * @return the normalized string 845 * @deprecated ICU 56 Use {@link Normalizer2} instead. 846 */ 847 @Deprecated normalize(String src,Mode mode)848 public static String normalize(String src,Mode mode) { 849 return normalize(src, mode, 0); 850 } 851 /** 852 * Normalize a string. 853 * The string will be normalized according to the specified normalization 854 * mode and options. 855 * @param source The char array to normalize. 856 * @param target A char buffer to receive the normalized text. 857 * @param mode The normalization mode; one of Normalizer.NONE, 858 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 859 * Normalizer.NFKD, Normalizer.DEFAULT 860 * @param options The normalization options, ORed together (0 for no options). 861 * @return int The total buffer size needed;if greater than length of 862 * result, the output was truncated. 863 * @exception IndexOutOfBoundsException if the target capacity is less 864 * than the required length 865 * @deprecated ICU 56 Use {@link Normalizer2} instead. 866 */ 867 @Deprecated normalize(char[] source,char[] target, Mode mode, int options)868 public static int normalize(char[] source,char[] target, Mode mode, int options) { 869 return normalize(source,0,source.length,target,0,target.length,mode, options); 870 } 871 872 /** 873 * Normalize a string. 874 * The string will be normalized according to the specified normalization 875 * mode and options. 876 * @param src The char array to compose. 
877 * @param srcStart Start index of the source 878 * @param srcLimit Limit index of the source 879 * @param dest The char buffer to fill in 880 * @param destStart Start index of the destination buffer 881 * @param destLimit End index of the destination buffer 882 * @param mode The normalization mode; one of Normalizer.NONE, 883 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 884 * Normalizer.NFKD, Normalizer.DEFAULT 885 * @param options The normalization options, ORed together (0 for no options). 886 * @return int The total buffer size needed;if greater than length of 887 * result, the output was truncated. 888 * @exception IndexOutOfBoundsException if the target capacity is 889 * less than the required length 890 * @deprecated ICU 56 Use {@link Normalizer2} instead. 891 */ 892 @Deprecated normalize(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, Mode mode, int options)893 public static int normalize(char[] src,int srcStart, int srcLimit, 894 char[] dest,int destStart, int destLimit, 895 Mode mode, int options) { 896 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 897 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 898 mode.getNormalizer2(options).normalize(srcBuffer, app); 899 return app.length(); 900 } 901 902 /** 903 * Normalize a codepoint according to the given mode 904 * @param char32 The input string to be normalized. 905 * @param mode The normalization mode 906 * @param options Options for use with exclusion set and tailored Normalization 907 * The only option that is currently recognized is UNICODE_3_2 908 * @return String The normalized string 909 * @see #UNICODE_3_2 910 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
911 */ 912 @Deprecated normalize(int char32, Mode mode, int options)913 public static String normalize(int char32, Mode mode, int options) { 914 if(mode == NFD && options == 0) { 915 String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32); 916 if(decomposition == null) { 917 decomposition = UTF16.valueOf(char32); 918 } 919 return decomposition; 920 } 921 return normalize(UTF16.valueOf(char32), mode, options); 922 } 923 924 /** 925 * Convenience method to normalize a codepoint according to the given mode 926 * @param char32 The input string to be normalized. 927 * @param mode The normalization mode 928 * @return String The normalized string 929 * @deprecated ICU 56 Use {@link Normalizer2} instead. 930 */ 931 @Deprecated normalize(int char32, Mode mode)932 public static String normalize(int char32, Mode mode) { 933 return normalize(char32, mode, 0); 934 } 935 936 /** 937 * Convenience method. 938 * 939 * @param source string for determining if it is in a normalized format 940 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 941 * Normalizer.NFKC,Normalizer.NFKD) 942 * @return Return code to specify if the text is normalized or not 943 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 944 * @deprecated ICU 56 Use {@link Normalizer2} instead. 945 */ 946 @Deprecated quickCheck(String source, Mode mode)947 public static QuickCheckResult quickCheck(String source, Mode mode) { 948 return quickCheck(source, mode, 0); 949 } 950 951 /** 952 * Performing quick check on a string, to quickly determine if the string is 953 * in a particular normalization format. 954 * Three types of result can be returned Normalizer.YES, Normalizer.NO or 955 * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument 956 * string is in the desired normalized format, Normalizer.NO determines that 957 * argument string is not in the desired normalized format. 
A 958 * Normalizer.MAYBE result indicates that a more thorough check is required, 959 * the user may have to put the string in its normalized form and compare 960 * the results. 961 * 962 * @param source string for determining if it is in a normalized format 963 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 964 * Normalizer.NFKC,Normalizer.NFKD) 965 * @param options Options for use with exclusion set and tailored Normalization 966 * The only option that is currently recognized is UNICODE_3_2 967 * @return Return code to specify if the text is normalized or not 968 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 969 * @deprecated ICU 56 Use {@link Normalizer2} instead. 970 */ 971 @Deprecated quickCheck(String source, Mode mode, int options)972 public static QuickCheckResult quickCheck(String source, Mode mode, int options) { 973 return mode.getNormalizer2(options).quickCheck(source); 974 } 975 976 /** 977 * Convenience method. 978 * 979 * @param source Array of characters for determining if it is in a 980 * normalized format 981 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 982 * Normalizer.NFKC,Normalizer.NFKD) 983 * @param options Options for use with exclusion set and tailored Normalization 984 * The only option that is currently recognized is UNICODE_3_2 985 * @return Return code to specify if the text is normalized or not 986 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 987 * @deprecated ICU 56 Use {@link Normalizer2} instead. 988 */ 989 @Deprecated quickCheck(char[] source, Mode mode, int options)990 public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) { 991 return quickCheck(source, 0, source.length, mode, options); 992 } 993 994 /** 995 * Performing quick check on a string, to quickly determine if the string is 996 * in a particular normalization format. 997 * Three types of result can be returned Normalizer.YES, Normalizer.NO or 998 * Normalizer.MAYBE. 
Result Normalizer.YES indicates that the argument 999 * string is in the desired normalized format, Normalizer.NO determines that 1000 * argument string is not in the desired normalized format. A 1001 * Normalizer.MAYBE result indicates that a more thorough check is required, 1002 * the user may have to put the string in its normalized form and compare 1003 * the results. 1004 * 1005 * @param source string for determining if it is in a normalized format 1006 * @param start the start index of the source 1007 * @param limit the limit index of the source it is equal to the length 1008 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 1009 * Normalizer.NFKC,Normalizer.NFKD) 1010 * @param options Options for use with exclusion set and tailored Normalization 1011 * The only option that is currently recognized is UNICODE_3_2 1012 * @return Return code to specify if the text is normalized or not 1013 * (Normalizer.YES, Normalizer.NO or 1014 * Normalizer.MAYBE) 1015 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1016 */ 1017 @Deprecated quickCheck(char[] source,int start, int limit, Mode mode,int options)1018 public static QuickCheckResult quickCheck(char[] source,int start, 1019 int limit, Mode mode,int options) { 1020 CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start); 1021 return mode.getNormalizer2(options).quickCheck(srcBuffer); 1022 } 1023 1024 /** 1025 * Test if a string is in a given normalization form. 1026 * This is semantically equivalent to source.equals(normalize(source, mode)). 1027 * 1028 * Unlike quickCheck(), this function returns a definitive result, 1029 * never a "maybe". 1030 * For NFD, NFKD, and FCD, both functions work exactly the same. 1031 * For NFC and NFKC where quickCheck may return "maybe", this function will 1032 * perform further tests to arrive at a true/false result. 
1033 * @param src The input array of characters to be checked to see if 1034 * it is normalized 1035 * @param start The strart index in the source 1036 * @param limit The limit index in the source 1037 * @param mode the normalization mode 1038 * @param options Options for use with exclusion set and tailored Normalization 1039 * The only option that is currently recognized is UNICODE_3_2 1040 * @return Boolean value indicating whether the source string is in the 1041 * "mode" normalization form 1042 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1043 */ 1044 @Deprecated isNormalized(char[] src,int start, int limit, Mode mode, int options)1045 public static boolean isNormalized(char[] src,int start, 1046 int limit, Mode mode, 1047 int options) { 1048 CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start); 1049 return mode.getNormalizer2(options).isNormalized(srcBuffer); 1050 } 1051 1052 /** 1053 * Test if a string is in a given normalization form. 1054 * This is semantically equivalent to source.equals(normalize(source, mode)). 1055 * 1056 * Unlike quickCheck(), this function returns a definitive result, 1057 * never a "maybe". 1058 * For NFD, NFKD, and FCD, both functions work exactly the same. 1059 * For NFC and NFKC where quickCheck may return "maybe", this function will 1060 * perform further tests to arrive at a true/false result. 1061 * @param str the input string to be checked to see if it is 1062 * normalized 1063 * @param mode the normalization mode 1064 * @param options Options for use with exclusion set and tailored Normalization 1065 * The only option that is currently recognized is UNICODE_3_2 1066 * @see #isNormalized 1067 * @deprecated ICU 56 Use {@link Normalizer2} instead. 
1068 */ 1069 @Deprecated isNormalized(String str, Mode mode, int options)1070 public static boolean isNormalized(String str, Mode mode, int options) { 1071 return mode.getNormalizer2(options).isNormalized(str); 1072 } 1073 1074 /** 1075 * Convenience Method 1076 * @param char32 the input code point to be checked to see if it is 1077 * normalized 1078 * @param mode the normalization mode 1079 * @param options Options for use with exclusion set and tailored Normalization 1080 * The only option that is currently recognized is UNICODE_3_2 1081 * 1082 * @see #isNormalized 1083 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1084 */ 1085 @Deprecated isNormalized(int char32, Mode mode,int options)1086 public static boolean isNormalized(int char32, Mode mode,int options) { 1087 return isNormalized(UTF16.valueOf(char32), mode, options); 1088 } 1089 1090 /** 1091 * Compare two strings for canonical equivalence. 1092 * Further options include case-insensitive comparison and 1093 * code point order (as opposed to code unit order). 1094 * 1095 * Canonical equivalence between two strings is defined as their normalized 1096 * forms (NFD or NFC) being identical. 1097 * This function compares strings incrementally instead of normalizing 1098 * (and optionally case-folding) both strings entirely, 1099 * improving performance significantly. 1100 * 1101 * Bulk normalization is only necessary if the strings do not fulfill the 1102 * FCD conditions. Only in this case, and only if the strings are relatively 1103 * long, is memory allocated temporarily. 1104 * For FCD strings and short non-FCD strings there is no memory allocation. 1105 * 1106 * Semantically, this is equivalent to 1107 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2))) 1108 * where code point order and foldCase are all optional. 1109 * 1110 * @param s1 First source character array. 
1111 * @param s1Start start index of source 1112 * @param s1Limit limit of the source 1113 * 1114 * @param s2 Second source character array. 1115 * @param s2Start start index of the source 1116 * @param s2Limit limit of the source 1117 * 1118 * @param options A bit set of options: 1119 * - FOLD_CASE_DEFAULT or 0 is used for default options: 1120 * Case-sensitive comparison in code unit order, and the input strings 1121 * are quick-checked for FCD. 1122 * 1123 * - INPUT_IS_FCD 1124 * Set if the caller knows that both s1 and s2 fulfill the FCD 1125 * conditions.If not set, the function will quickCheck for FCD 1126 * and normalize if necessary. 1127 * 1128 * - COMPARE_CODE_POINT_ORDER 1129 * Set to choose code point order instead of code unit order 1130 * 1131 * - COMPARE_IGNORE_CASE 1132 * Set to compare strings case-insensitively using case folding, 1133 * instead of case-sensitively. 1134 * If set, then the following case folding options are used. 1135 * 1136 * 1137 * @return <0 or 0 or >0 as usual for string comparisons 1138 * 1139 * @see #normalize 1140 * @see #FCD 1141 * @stable ICU 2.8 1142 */ compare(char[] s1, int s1Start, int s1Limit, char[] s2, int s2Start, int s2Limit, int options)1143 public static int compare(char[] s1, int s1Start, int s1Limit, 1144 char[] s2, int s2Start, int s2Limit, 1145 int options) { 1146 if( s1==null || s1Start<0 || s1Limit<0 || 1147 s2==null || s2Start<0 || s2Limit<0 || 1148 s1Limit<s1Start || s2Limit<s2Start 1149 ) { 1150 throw new IllegalArgumentException(); 1151 } 1152 return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start), 1153 CharBuffer.wrap(s2, s2Start, s2Limit-s2Start), 1154 options); 1155 } 1156 1157 /** 1158 * Compare two strings for canonical equivalence. 1159 * Further options include case-insensitive comparison and 1160 * code point order (as opposed to code unit order). 1161 * 1162 * Canonical equivalence between two strings is defined as their normalized 1163 * forms (NFD or NFC) being identical. 
1164 * This function compares strings incrementally instead of normalizing 1165 * (and optionally case-folding) both strings entirely, 1166 * improving performance significantly. 1167 * 1168 * Bulk normalization is only necessary if the strings do not fulfill the 1169 * FCD conditions. Only in this case, and only if the strings are relatively 1170 * long, is memory allocated temporarily. 1171 * For FCD strings and short non-FCD strings there is no memory allocation. 1172 * 1173 * Semantically, this is equivalent to 1174 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2))) 1175 * where code point order and foldCase are all optional. 1176 * 1177 * @param s1 First source string. 1178 * @param s2 Second source string. 1179 * 1180 * @param options A bit set of options: 1181 * - FOLD_CASE_DEFAULT or 0 is used for default options: 1182 * Case-sensitive comparison in code unit order, and the input strings 1183 * are quick-checked for FCD. 1184 * 1185 * - INPUT_IS_FCD 1186 * Set if the caller knows that both s1 and s2 fulfill the FCD 1187 * conditions. If not set, the function will quickCheck for FCD 1188 * and normalize if necessary. 1189 * 1190 * - COMPARE_CODE_POINT_ORDER 1191 * Set to choose code point order instead of code unit order 1192 * 1193 * - COMPARE_IGNORE_CASE 1194 * Set to compare strings case-insensitively using case folding, 1195 * instead of case-sensitively. 1196 * If set, then the following case folding options are used. 1197 * 1198 * @return <0 or 0 or >0 as usual for string comparisons 1199 * 1200 * @see #normalize 1201 * @see #FCD 1202 * @stable ICU 2.8 1203 */ compare(String s1, String s2, int options)1204 public static int compare(String s1, String s2, int options) { 1205 return internalCompare(s1, s2, options); 1206 } 1207 1208 /** 1209 * Compare two strings for canonical equivalence. 1210 * Further options include case-insensitive comparison and 1211 * code point order (as opposed to code unit order). 1212 * Convenience method. 
1213 * 1214 * @param s1 First source string. 1215 * @param s2 Second source string. 1216 * 1217 * @param options A bit set of options: 1218 * - FOLD_CASE_DEFAULT or 0 is used for default options: 1219 * Case-sensitive comparison in code unit order, and the input strings 1220 * are quick-checked for FCD. 1221 * 1222 * - INPUT_IS_FCD 1223 * Set if the caller knows that both s1 and s2 fulfill the FCD 1224 * conditions. If not set, the function will quickCheck for FCD 1225 * and normalize if necessary. 1226 * 1227 * - COMPARE_CODE_POINT_ORDER 1228 * Set to choose code point order instead of code unit order 1229 * 1230 * - COMPARE_IGNORE_CASE 1231 * Set to compare strings case-insensitively using case folding, 1232 * instead of case-sensitively. 1233 * If set, then the following case folding options are used. 1234 * 1235 * @return <0 or 0 or >0 as usual for string comparisons 1236 * 1237 * @see #normalize 1238 * @see #FCD 1239 * @stable ICU 2.8 1240 */ compare(char[] s1, char[] s2, int options)1241 public static int compare(char[] s1, char[] s2, int options) { 1242 return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options); 1243 } 1244 1245 /** 1246 * Convenience method that can have faster implementation 1247 * by not allocating buffers. 1248 * @param char32a the first code point to be checked against the 1249 * @param char32b the second code point 1250 * @param options A bit set of options 1251 * @stable ICU 2.8 1252 */ compare(int char32a, int char32b, int options)1253 public static int compare(int char32a, int char32b, int options) { 1254 return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD); 1255 } 1256 1257 /** 1258 * Convenience method that can have faster implementation 1259 * by not allocating buffers. 
1260 * @param char32a the first code point to be checked against 1261 * @param str2 the second string 1262 * @param options A bit set of options 1263 * @stable ICU 2.8 1264 */ compare(int char32a, String str2, int options)1265 public static int compare(int char32a, String str2, int options) { 1266 return internalCompare(UTF16.valueOf(char32a), str2, options); 1267 } 1268 1269 /* Concatenation of normalized strings --------------------------------- */ 1270 /** 1271 * Concatenate normalized strings, making sure that the result is normalized 1272 * as well. 1273 * 1274 * If both the left and the right strings are in 1275 * the normalization form according to "mode", 1276 * then the result will be 1277 * 1278 * <code> 1279 * dest=normalize(left+right, mode) 1280 * </code> 1281 * 1282 * With the input strings already being normalized, 1283 * this function will use next() and previous() 1284 * to find the adjacent end pieces of the input strings. 1285 * Only the concatenation of these end pieces will be normalized and 1286 * then concatenated with the remaining parts of the input strings. 1287 * 1288 * It is allowed to have dest==left to avoid copying the entire left string. 1289 * 1290 * @param left Left source array, may be same as dest. 1291 * @param leftStart start in the left array. 1292 * @param leftLimit limit in the left array (==length) 1293 * @param right Right source array. 1294 * @param rightStart start in the right array. 1295 * @param rightLimit limit in the right array (==length) 1296 * @param dest The output buffer; can be null if destStart==destLimit==0 1297 * for pure preflighting. 1298 * @param destStart start in the destination array 1299 * @param destLimit limit in the destination array (==length) 1300 * @param mode The normalization mode. 1301 * @param options The normalization options, ORed together (0 for no options). 
1302 * @return Length of output (number of chars) when successful or 1303 * IndexOutOfBoundsException 1304 * @exception IndexOutOfBoundsException whose message has the string 1305 * representation of destination capacity required. 1306 * @see #normalize 1307 * @see #next 1308 * @see #previous 1309 * @exception IndexOutOfBoundsException if target capacity is less than the 1310 * required length 1311 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1312 */ 1313 @Deprecated concatenate(char[] left, int leftStart, int leftLimit, char[] right, int rightStart, int rightLimit, char[] dest, int destStart, int destLimit, Normalizer.Mode mode, int options)1314 public static int concatenate(char[] left, int leftStart, int leftLimit, 1315 char[] right, int rightStart, int rightLimit, 1316 char[] dest, int destStart, int destLimit, 1317 Normalizer.Mode mode, int options) { 1318 if(dest == null) { 1319 throw new IllegalArgumentException(); 1320 } 1321 1322 /* check for overlapping right and destination */ 1323 if (right == dest && rightStart < destLimit && destStart < rightLimit) { 1324 throw new IllegalArgumentException("overlapping right and dst ranges"); 1325 } 1326 1327 /* allow left==dest */ 1328 StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16); 1329 destBuilder.append(left, leftStart, leftLimit-leftStart); 1330 CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart); 1331 mode.getNormalizer2(options).append(destBuilder, rightBuffer); 1332 int destLength=destBuilder.length(); 1333 if(destLength<=(destLimit-destStart)) { 1334 destBuilder.getChars(0, destLength, dest, destStart); 1335 return destLength; 1336 } else { 1337 throw new IndexOutOfBoundsException(Integer.toString(destLength)); 1338 } 1339 } 1340 1341 /** 1342 * Concatenate normalized strings, making sure that the result is normalized 1343 * as well. 
1344 * 1345 * If both the left and the right strings are in 1346 * the normalization form according to "mode", 1347 * then the result will be 1348 * 1349 * <code> 1350 * dest=normalize(left+right, mode) 1351 * </code> 1352 * 1353 * For details see concatenate 1354 * 1355 * @param left Left source string. 1356 * @param right Right source string. 1357 * @param mode The normalization mode. 1358 * @param options The normalization options, ORed together (0 for no options). 1359 * @return result 1360 * 1361 * @see #concatenate 1362 * @see #normalize 1363 * @see #next 1364 * @see #previous 1365 * @see #concatenate 1366 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1367 */ 1368 @Deprecated concatenate(char[] left, char[] right,Mode mode, int options)1369 public static String concatenate(char[] left, char[] right,Mode mode, int options) { 1370 StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left); 1371 return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString(); 1372 } 1373 1374 /** 1375 * Concatenate normalized strings, making sure that the result is normalized 1376 * as well. 1377 * 1378 * If both the left and the right strings are in 1379 * the normalization form according to "mode", 1380 * then the result will be 1381 * 1382 * <code> 1383 * dest=normalize(left+right, mode) 1384 * </code> 1385 * 1386 * With the input strings already being normalized, 1387 * this function will use next() and previous() 1388 * to find the adjacent end pieces of the input strings. 1389 * Only the concatenation of these end pieces will be normalized and 1390 * then concatenated with the remaining parts of the input strings. 1391 * 1392 * @param left Left source string. 1393 * @param right Right source string. 1394 * @param mode The normalization mode. 1395 * @param options The normalization options, ORed together (0 for no options). 
1396 * @return result 1397 * 1398 * @see #concatenate 1399 * @see #normalize 1400 * @see #next 1401 * @see #previous 1402 * @see #concatenate 1403 * @deprecated ICU 56 Use {@link Normalizer2} instead. 1404 */ 1405 @Deprecated concatenate(String left, String right, Mode mode, int options)1406 public static String concatenate(String left, String right, Mode mode, int options) { 1407 StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left); 1408 return mode.getNormalizer2(options).append(dest, right).toString(); 1409 } 1410 1411 /** 1412 * Gets the FC_NFKC closure value. 1413 * @param c The code point whose closure value is to be retrieved 1414 * @param dest The char array to receive the closure value 1415 * @return the length of the closure value; 0 if there is none 1416 * @deprecated ICU 56 1417 */ 1418 @Deprecated getFC_NFKC_Closure(int c,char[] dest)1419 public static int getFC_NFKC_Closure(int c,char[] dest) { 1420 String closure=getFC_NFKC_Closure(c); 1421 int length=closure.length(); 1422 if(length!=0 && dest!=null && length<=dest.length) { 1423 closure.getChars(0, length, dest, 0); 1424 } 1425 return length; 1426 } 1427 /** 1428 * Gets the FC_NFKC closure value. 1429 * @param c The code point whose closure value is to be retrieved 1430 * @return String representation of the closure value; "" if there is none 1431 * @deprecated ICU 56 1432 */ 1433 @Deprecated getFC_NFKC_Closure(int c)1434 public static String getFC_NFKC_Closure(int c) { 1435 // Compute the FC_NFKC_Closure on the fly: 1436 // We have the API for complete coverage of Unicode properties, although 1437 // this value by itself is not useful via API. 1438 // (What could be useful is a custom normalization table that combines 1439 // case folding and NFKC.) 1440 // For the derivation, see Unicode's DerivedNormalizationProps.txt. 
1441 Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2; 1442 UCaseProps csp=UCaseProps.INSTANCE; 1443 // first: b = NFKC(Fold(a)) 1444 StringBuilder folded=new StringBuilder(); 1445 int folded1Length=csp.toFullFolding(c, folded, 0); 1446 if(folded1Length<0) { 1447 Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl; 1448 if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) { 1449 return ""; // c does not change at all under CaseFolding+NFKC 1450 } 1451 folded.appendCodePoint(c); 1452 } else { 1453 if(folded1Length>UCaseProps.MAX_STRING_LENGTH) { 1454 folded.appendCodePoint(folded1Length); 1455 } 1456 } 1457 String kc1=nfkc.normalize(folded); 1458 // second: c = NFKC(Fold(b)) 1459 String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0)); 1460 // if (c != b) add the mapping from a to c 1461 if(kc1.equals(kc2)) { 1462 return ""; 1463 } else { 1464 return kc2; 1465 } 1466 } 1467 1468 //------------------------------------------------------------------------- 1469 // Iteration API 1470 //------------------------------------------------------------------------- 1471 1472 /** 1473 * Return the current character in the normalized text. 1474 * @return The codepoint as an int 1475 * @deprecated ICU 56 1476 */ 1477 @Deprecated current()1478 public int current() { 1479 if(bufferPos<buffer.length() || nextNormalize()) { 1480 return buffer.codePointAt(bufferPos); 1481 } else { 1482 return DONE; 1483 } 1484 } 1485 1486 /** 1487 * Return the next character in the normalized text and advance 1488 * the iteration position by one. If the end 1489 * of the text has already been reached, {@link #DONE} is returned. 
1490 * @return The codepoint as an int 1491 * @deprecated ICU 56 1492 */ 1493 @Deprecated next()1494 public int next() { 1495 if(bufferPos<buffer.length() || nextNormalize()) { 1496 int c=buffer.codePointAt(bufferPos); 1497 bufferPos+=Character.charCount(c); 1498 return c; 1499 } else { 1500 return DONE; 1501 } 1502 } 1503 1504 1505 /** 1506 * Return the previous character in the normalized text and decrement 1507 * the iteration position by one. If the beginning 1508 * of the text has already been reached, {@link #DONE} is returned. 1509 * @return The codepoint as an int 1510 * @deprecated ICU 56 1511 */ 1512 @Deprecated previous()1513 public int previous() { 1514 if(bufferPos>0 || previousNormalize()) { 1515 int c=buffer.codePointBefore(bufferPos); 1516 bufferPos-=Character.charCount(c); 1517 return c; 1518 } else { 1519 return DONE; 1520 } 1521 } 1522 1523 /** 1524 * Reset the index to the beginning of the text. 1525 * This is equivalent to setIndexOnly(startIndex)). 1526 * @deprecated ICU 56 1527 */ 1528 @Deprecated reset()1529 public void reset() { 1530 text.setToStart(); 1531 currentIndex=nextIndex=0; 1532 clearBuffer(); 1533 } 1534 1535 /** 1536 * Set the iteration position in the input text that is being normalized, 1537 * without any immediate normalization. 1538 * After setIndexOnly(), getIndex() will return the same index that is 1539 * specified here. 1540 * 1541 * @param index the desired index in the input text. 1542 * @deprecated ICU 56 1543 */ 1544 @Deprecated setIndexOnly(int index)1545 public void setIndexOnly(int index) { 1546 text.setIndex(index); // validates index 1547 currentIndex=nextIndex=index; 1548 clearBuffer(); 1549 } 1550 1551 /** 1552 * Set the iteration position in the input text that is being normalized 1553 * and return the first normalized character at that position. 
1554 * <p> 1555 * <b>Note:</b> This method sets the position in the <em>input</em> text, 1556 * while {@link #next} and {@link #previous} iterate through characters 1557 * in the normalized <em>output</em>. This means that there is not 1558 * necessarily a one-to-one correspondence between characters returned 1559 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and 1560 * returned from <tt>setIndex</tt> and {@link #getIndex}. 1561 * <p> 1562 * @param index the desired index in the input text. 1563 * 1564 * @return the first normalized character that is the result of iterating 1565 * forward starting at the given index. 1566 * 1567 * @throws IllegalArgumentException if the given index is less than 1568 * {@link #getBeginIndex} or greater than {@link #getEndIndex}. 1569 * @deprecated ICU 3.2 1570 * @obsolete ICU 3.2 1571 */ 1572 @Deprecated 1573 ///CLOVER:OFF setIndex(int index)1574 public int setIndex(int index) { 1575 setIndexOnly(index); 1576 return current(); 1577 } 1578 ///CLOVER:ON 1579 /** 1580 * Retrieve the index of the start of the input text. This is the begin 1581 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the 1582 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating 1583 * @deprecated ICU 2.2. Use startIndex() instead. 1584 * @return The codepoint as an int 1585 * @see #startIndex 1586 */ 1587 @Deprecated getBeginIndex()1588 public int getBeginIndex() { 1589 return 0; 1590 } 1591 1592 /** 1593 * Retrieve the index of the end of the input text. This is the end index 1594 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 1595 * over which this <tt>Normalizer</tt> is iterating 1596 * @deprecated ICU 2.2. Use endIndex() instead. 1597 * @return The codepoint as an int 1598 * @see #endIndex 1599 */ 1600 @Deprecated getEndIndex()1601 public int getEndIndex() { 1602 return endIndex(); 1603 } 1604 /** 1605 * Return the first character in the normalized text. 
This resets 1606 * the <tt>Normalizer's</tt> position to the beginning of the text. 1607 * @return The codepoint as an int 1608 * @deprecated ICU 56 1609 */ 1610 @Deprecated first()1611 public int first() { 1612 reset(); 1613 return next(); 1614 } 1615 1616 /** 1617 * Return the last character in the normalized text. This resets 1618 * the <tt>Normalizer's</tt> position to be just before the 1619 * the input text corresponding to that normalized character. 1620 * @return The codepoint as an int 1621 * @deprecated ICU 56 1622 */ 1623 @Deprecated last()1624 public int last() { 1625 text.setToLimit(); 1626 currentIndex=nextIndex=text.getIndex(); 1627 clearBuffer(); 1628 return previous(); 1629 } 1630 1631 /** 1632 * Retrieve the current iteration position in the input text that is 1633 * being normalized. This method is useful in applications such as 1634 * searching, where you need to be able to determine the position in 1635 * the input text that corresponds to a given normalized output character. 1636 * <p> 1637 * <b>Note:</b> This method sets the position in the <em>input</em>, while 1638 * {@link #next} and {@link #previous} iterate through characters in the 1639 * <em>output</em>. This means that there is not necessarily a one-to-one 1640 * correspondence between characters returned by <tt>next</tt> and 1641 * <tt>previous</tt> and the indices passed to and returned from 1642 * <tt>setIndex</tt> and {@link #getIndex}. 1643 * @return The current iteration position 1644 * @deprecated ICU 56 1645 */ 1646 @Deprecated getIndex()1647 public int getIndex() { 1648 if(bufferPos<buffer.length()) { 1649 return currentIndex; 1650 } else { 1651 return nextIndex; 1652 } 1653 } 1654 1655 /** 1656 * Retrieve the index of the start of the input text. This is the begin 1657 * index of the <tt>CharacterIterator</tt> or the start (i.e. 
* 0) of the
 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
 * @return The start index of the input text; this implementation always returns 0
 * @deprecated ICU 56
 */
@Deprecated
public int startIndex() {
    return 0;
}

/**
 * Retrieve the index of the end of the input text.  This is the end index
 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating
 * @return The end index of the input text, i.e. the length reported by the underlying text
 * @deprecated ICU 56
 */
@Deprecated
public int endIndex() {
    return text.getLength();
}

//-------------------------------------------------------------------------
// Iterator attributes
//-------------------------------------------------------------------------
/**
 * Set the normalization mode for this object.
 * <p>
 * <b>Note:</b>If the normalization mode is changed while iterating
 * over a string, calls to {@link #next} and {@link #previous} may
 * return previously buffered characters in the old normalization mode
 * until the iteration is able to re-sync at the next base character.
 * It is safest to call {@link #setText setText()}, {@link #first},
 * {@link #last}, etc. after calling <tt>setMode</tt>.
 * <p>
 * @param newMode the new mode for this <tt>Normalizer</tt>.
 * The supported modes are:
 * <ul>
 *  <li>{@link #NFC}    - Unicode canonical decomposition
 *                        followed by canonical composition.
 *  <li>{@link #NFKC}   - Unicode compatibility decomposition
 *                        followed by canonical composition.
 *  <li>{@link #NFD}    - Unicode canonical decomposition
 *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
 *  <li>{@link #NONE}   - Do nothing but return characters
 *                        from the underlying input text.
1703 * </ul> 1704 * 1705 * @see #getMode 1706 * @deprecated ICU 56 1707 */ 1708 @Deprecated setMode(Mode newMode)1709 public void setMode(Mode newMode) { 1710 mode = newMode; 1711 norm2 = mode.getNormalizer2(options); 1712 } 1713 /** 1714 * Return the basic operation performed by this <tt>Normalizer</tt> 1715 * 1716 * @see #setMode 1717 * @deprecated ICU 56 1718 */ 1719 @Deprecated getMode()1720 public Mode getMode() { 1721 return mode; 1722 } 1723 /** 1724 * Set options that affect this <tt>Normalizer</tt>'s operation. 1725 * Options do not change the basic composition or decomposition operation 1726 * that is being performed , but they control whether 1727 * certain optional portions of the operation are done. 1728 * Currently the only available option is: 1729 * 1730 * <ul> 1731 * <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2. 1732 * </ul> 1733 * 1734 * @param option the option whose value is to be set. 1735 * @param value the new setting for the option. Use <tt>true</tt> to 1736 * turn the option on and <tt>false</tt> to turn it off. 1737 * 1738 * @see #getOption 1739 * @deprecated ICU 56 1740 */ 1741 @Deprecated setOption(int option,boolean value)1742 public void setOption(int option,boolean value) { 1743 if (value) { 1744 options |= option; 1745 } else { 1746 options &= (~option); 1747 } 1748 norm2 = mode.getNormalizer2(options); 1749 } 1750 1751 /** 1752 * Determine whether an option is turned on or off. 1753 * <p> 1754 * @see #setOption 1755 * @deprecated ICU 56 1756 */ 1757 @Deprecated getOption(int option)1758 public int getOption(int option) { 1759 if((options & option)!=0) { 1760 return 1 ; 1761 } else { 1762 return 0; 1763 } 1764 } 1765 1766 /** 1767 * Gets the underlying text storage 1768 * @param fillIn the char buffer to fill the UTF-16 units. 
1769 * The length of the buffer should be equal to the length of the 1770 * underlying text storage 1771 * @throws IndexOutOfBoundsException If the index passed for the array is invalid. 1772 * @see #getLength 1773 * @deprecated ICU 56 1774 */ 1775 @Deprecated getText(char[] fillIn)1776 public int getText(char[] fillIn) { 1777 return text.getText(fillIn); 1778 } 1779 1780 /** 1781 * Gets the length of underlying text storage 1782 * @return the length 1783 * @deprecated ICU 56 1784 */ 1785 @Deprecated getLength()1786 public int getLength() { 1787 return text.getLength(); 1788 } 1789 1790 /** 1791 * Returns the text under iteration as a string 1792 * @return a copy of the text under iteration. 1793 * @deprecated ICU 56 1794 */ 1795 @Deprecated getText()1796 public String getText() { 1797 return text.getText(); 1798 } 1799 1800 /** 1801 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1802 * The iteration position is set to the beginning of the input text. 1803 * @param newText The new string to be normalized. 1804 * @deprecated ICU 56 1805 */ 1806 @Deprecated setText(StringBuffer newText)1807 public void setText(StringBuffer newText) { 1808 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1809 if (newIter == null) { 1810 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1811 } 1812 text = newIter; 1813 reset(); 1814 } 1815 1816 /** 1817 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1818 * The iteration position is set to the beginning of the input text. 1819 * @param newText The new string to be normalized. 
1820 * @deprecated ICU 56 1821 */ 1822 @Deprecated setText(char[] newText)1823 public void setText(char[] newText) { 1824 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1825 if (newIter == null) { 1826 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1827 } 1828 text = newIter; 1829 reset(); 1830 } 1831 1832 /** 1833 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1834 * The iteration position is set to the beginning of the input text. 1835 * @param newText The new string to be normalized. 1836 * @deprecated ICU 56 1837 */ 1838 @Deprecated setText(String newText)1839 public void setText(String newText) { 1840 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1841 if (newIter == null) { 1842 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1843 } 1844 text = newIter; 1845 reset(); 1846 } 1847 1848 /** 1849 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1850 * The iteration position is set to the beginning of the input text. 1851 * @param newText The new string to be normalized. 1852 * @deprecated ICU 56 1853 */ 1854 @Deprecated setText(CharacterIterator newText)1855 public void setText(CharacterIterator newText) { 1856 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1857 if (newIter == null) { 1858 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1859 } 1860 text = newIter; 1861 reset(); 1862 } 1863 1864 /** 1865 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1866 * The iteration position is set to the beginning of the string. 1867 * @param newText The new string to be normalized. 
1868 * @deprecated ICU 56 1869 */ 1870 @Deprecated setText(UCharacterIterator newText)1871 public void setText(UCharacterIterator newText) { 1872 try{ 1873 UCharacterIterator newIter = (UCharacterIterator)newText.clone(); 1874 if (newIter == null) { 1875 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1876 } 1877 text = newIter; 1878 reset(); 1879 }catch(CloneNotSupportedException e) { 1880 throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e); 1881 } 1882 } 1883 clearBuffer()1884 private void clearBuffer() { 1885 buffer.setLength(0); 1886 bufferPos=0; 1887 } 1888 nextNormalize()1889 private boolean nextNormalize() { 1890 clearBuffer(); 1891 currentIndex=nextIndex; 1892 text.setIndex(nextIndex); 1893 // Skip at least one character so we make progress. 1894 int c=text.nextCodePoint(); 1895 if(c<0) { 1896 return false; 1897 } 1898 StringBuilder segment=new StringBuilder().appendCodePoint(c); 1899 while((c=text.nextCodePoint())>=0) { 1900 if(norm2.hasBoundaryBefore(c)) { 1901 text.moveCodePointIndex(-1); 1902 break; 1903 } 1904 segment.appendCodePoint(c); 1905 } 1906 nextIndex=text.getIndex(); 1907 norm2.normalize(segment, buffer); 1908 return buffer.length()!=0; 1909 } 1910 previousNormalize()1911 private boolean previousNormalize() { 1912 clearBuffer(); 1913 nextIndex=currentIndex; 1914 text.setIndex(currentIndex); 1915 StringBuilder segment=new StringBuilder(); 1916 int c; 1917 while((c=text.previousCodePoint())>=0) { 1918 if(c<=0xffff) { 1919 segment.insert(0, (char)c); 1920 } else { 1921 segment.insert(0, Character.toChars(c)); 1922 } 1923 if(norm2.hasBoundaryBefore(c)) { 1924 break; 1925 } 1926 } 1927 currentIndex=text.getIndex(); 1928 norm2.normalize(segment, buffer); 1929 bufferPos=buffer.length(); 1930 return buffer.length()!=0; 1931 } 1932 1933 /* compare canonically equivalent ------------------------------------------- */ 1934 1935 // TODO: Broaden the public compare(String, String, options) API 
like this. Ticket #7407 internalCompare(CharSequence s1, CharSequence s2, int options)1936 private static int internalCompare(CharSequence s1, CharSequence s2, int options) { 1937 int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT; 1938 options|= COMPARE_EQUIV; 1939 1940 /* 1941 * UAX #21 Case Mappings, as fixed for Unicode version 4 1942 * (see Jitterbug 2021), defines a canonical caseless match as 1943 * 1944 * A string X is a canonical caseless match 1945 * for a string Y if and only if 1946 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 1947 * 1948 * For better performance, we check for FCD (or let the caller tell us that 1949 * both strings are in FCD) for the inner normalization. 1950 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that 1951 * case-folding preserves the FCD-ness of a string. 1952 * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold() 1953 * when there is a difference. 1954 * 1955 * Exception: When using the Turkic case-folding option, we do perform 1956 * full NFD first. This is because in the Turkic case precomposed characters 1957 * with 0049 capital I or 0069 small i fold differently whether they 1958 * are first decomposed or not, so an FCD check - a check only for 1959 * canonical order - is not sufficient. 1960 */ 1961 if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) { 1962 Normalizer2 n2; 1963 if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) { 1964 n2=NFD.getNormalizer2(normOptions); 1965 } else { 1966 n2=FCD.getNormalizer2(normOptions); 1967 } 1968 1969 // check if s1 and/or s2 fulfill the FCD conditions 1970 int spanQCYes1=n2.spanQuickCheckYes(s1); 1971 int spanQCYes2=n2.spanQuickCheckYes(s2); 1972 1973 /* 1974 * ICU 2.4 had a further optimization: 1975 * If both strings were not in FCD, then they were both NFD'ed, 1976 * and the COMPARE_EQUIV option was turned off. 
1977 * It is not entirely clear that this is valid with the current 1978 * definition of the canonical caseless match. 1979 * Therefore, ICU 2.6 removes that optimization. 1980 */ 1981 1982 if(spanQCYes1<s1.length()) { 1983 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1); 1984 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length())); 1985 } 1986 if(spanQCYes2<s2.length()) { 1987 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2); 1988 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length())); 1989 } 1990 } 1991 1992 return cmpEquivFold(s1, s2, options); 1993 } 1994 1995 /* 1996 * Compare two strings for canonical equivalence. 1997 * Further options include case-insensitive comparison and 1998 * code point order (as opposed to code unit order). 1999 * 2000 * In this function, canonical equivalence is optional as well. 2001 * If canonical equivalence is tested, then both strings must fulfill 2002 * the FCD check. 2003 * 2004 * Semantically, this is equivalent to 2005 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) 2006 * where code point order, NFD and foldCase are all optional. 2007 * 2008 * String comparisons almost always yield results before processing both strings 2009 * completely. 2010 * They are generally more efficient working incrementally instead of 2011 * performing the sub-processing (strlen, normalization, case-folding) 2012 * on the entire strings first. 2013 * 2014 * It is also unnecessary to not normalize identical characters. 
2015 * 2016 * This function works in principle as follows: 2017 * 2018 * loop { 2019 * get one code unit c1 from s1 (-1 if end of source) 2020 * get one code unit c2 from s2 (-1 if end of source) 2021 * 2022 * if(either string finished) { 2023 * return result; 2024 * } 2025 * if(c1==c2) { 2026 * continue; 2027 * } 2028 * 2029 * // c1!=c2 2030 * try to decompose/case-fold c1/c2, and continue if one does; 2031 * 2032 * // still c1!=c2 and neither decomposes/case-folds, return result 2033 * return c1-c2; 2034 * } 2035 * 2036 * When a character decomposes, then the pointer for that source changes to 2037 * the decomposition, pushing the previous pointer onto a stack. 2038 * When the end of the decomposition is reached, then the code unit reader 2039 * pops the previous source from the stack. 2040 * (Same for case-folding.) 2041 * 2042 * This is complicated further by operating on variable-width UTF-16. 2043 * The top part of the loop works on code units, while lookups for decomposition 2044 * and case-folding need code points. 2045 * Code points are assembled after the equality/end-of-source part. 2046 * The source pointer is only advanced beyond all code units when the code point 2047 * actually decomposes/case-folds. 2048 * 2049 * If we were on a trail surrogate unit when assembling a code point, 2050 * and the code point decomposes/case-folds, then the decomposition/folding 2051 * result must be compared with the part of the other string that corresponds to 2052 * this string's lead surrogate. 2053 * Since we only assemble a code point when hitting a trail unit when the 2054 * preceding lead units were identical, we back up the other string by one unit 2055 * in such a case. 2056 * 2057 * The optional code point order comparison at the end works with 2058 * the same fix-up as the other code point order comparison functions. 2059 * See ustring.c and the comment near the end of this function. 
2060 * 2061 * Assumption: A decomposition or case-folding result string never contains 2062 * a single surrogate. This is a safe assumption in the Unicode Standard. 2063 * Therefore, we do not need to check for surrogate pairs across 2064 * decomposition/case-folding boundaries. 2065 * 2066 * Further assumptions (see verifications tstnorm.cpp): 2067 * The API function checks for FCD first, while the core function 2068 * first case-folds and then decomposes. This requires that case-folding does not 2069 * un-FCD any strings. 2070 * 2071 * The API function may also NFD the input and turn off decomposition. 2072 * This requires that case-folding does not un-NFD strings either. 2073 * 2074 * TODO If any of the above two assumptions is violated, 2075 * then this entire code must be re-thought. 2076 * If this happens, then a simple solution is to case-fold both strings up front 2077 * and to turn off UNORM_INPUT_IS_FCD. 2078 * We already do this when not both strings are in FCD because makeFCD 2079 * would be a partial NFD before the case folding, which does not work. 2080 * Note that all of this is only a problem when case-folding _and_ 2081 * canonical equivalence come together. 2082 * (Comments in unorm_compare() are more up to date than this TODO.) 2083 */ 2084 2085 /* stack element for previous-level source/decomposition pointers */ 2086 private static final class CmpEquivLevel { 2087 CharSequence cs; 2088 int s; 2089 }; createCmpEquivLevelStack()2090 private static final CmpEquivLevel[] createCmpEquivLevelStack() { 2091 return new CmpEquivLevel[] { 2092 new CmpEquivLevel(), new CmpEquivLevel() 2093 }; 2094 } 2095 2096 /** 2097 * Internal option for unorm_cmpEquivFold() for decomposing. 2098 * If not set, just do strcasecmp(). 
*/
private static final int COMPARE_EQUIV=0x80000;

/* internal function; package visibility for use by UTF16.StringComparator */
/*
 * Incrementally compares cs1 and cs2 code unit by code unit.  When two units
 * differ, the code point at that position is case-folded (only if
 * COMPARE_IGNORE_CASE is set) and/or decomposed (only if COMPARE_EQUIV is set)
 * and the comparison continues inside the replacement text; the interrupted
 * source is remembered on a small per-string stack.
 *
 * Returns 0 when both sources end together, -1 when string 1 ends first,
 * 1 when string 2 ends first, and otherwise c1-c2 for the first pair of code
 * units that differ and cannot be folded or decomposed any further (with a
 * surrogate fix-up when COMPARE_CODE_POINT_ORDER is set).
 */
/*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
    Normalizer2Impl nfcImpl;
    UCaseProps csp;

    /* current-level start/limit - s1/s2 as current */
    int s1, s2, limit1, limit2;

    /* decomposition and case folding variables */
    int length;

    /* stacks of previous-level start/current/limit */
    CmpEquivLevel[] stack1=null, stack2=null;

    /* buffers for algorithmic decompositions */
    String decomp1, decomp2;

    /* case folding buffers, only use current-level start/limit */
    StringBuilder fold1, fold2;

    /* track which is the current level per string */
    int level1, level2;

    /* current code units, and code points for lookups */
    int c1, c2, cp1, cp2;

    /* no argument error checking because this itself is not an API */

    /*
     * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
     * otherwise this function must behave exactly as uprv_strCompare()
     * not checking for that here makes testing this function easier
     */

    /* normalization/properties data loaded? */
    if((options&COMPARE_EQUIV)!=0) {
        nfcImpl=Norm2AllModes.getNFCInstance().impl;
    } else {
        nfcImpl=null;
    }
    if((options&COMPARE_IGNORE_CASE)!=0) {
        csp=UCaseProps.INSTANCE;
        fold1=new StringBuilder();
        fold2=new StringBuilder();
    } else {
        csp=null;
        fold1=fold2=null;
    }

    /* initialize */
    s1=0;
    limit1=cs1.length();
    s2=0;
    limit2=cs2.length();

    level1=level2=0;
    c1=c2=-1;

    /* comparison loop */
    for(;;) {
        /*
         * here a code unit value of -1 means "get another code unit"
         * below it will mean "this source is finished"
         */

        if(c1<0) {
            /* get next code unit from string 1, post-increment */
            for(;;) {
                if(s1==limit1) {
                    if(level1==0) {
                        c1=-1;
                        break;
                    }
                } else {
                    c1=cs1.charAt(s1++);
                    break;
                }

                /* reached end of level buffer, pop one level */
                /* slots whose cs is null are skipped-level placeholders; keep popping */
                do {
                    --level1;
                    cs1=stack1[level1].cs;
                } while(cs1==null);
                s1=stack1[level1].s;
                limit1=cs1.length();
            }
        }

        if(c2<0) {
            /* get next code unit from string 2, post-increment */
            for(;;) {
                if(s2==limit2) {
                    if(level2==0) {
                        c2=-1;
                        break;
                    }
                } else {
                    c2=cs2.charAt(s2++);
                    break;
                }

                /* reached end of level buffer, pop one level */
                /* slots whose cs is null are skipped-level placeholders; keep popping */
                do {
                    --level2;
                    cs2=stack2[level2].cs;
                } while(cs2==null);
                s2=stack2[level2].s;
                limit2=cs2.length();
            }
        }

        /*
         * compare c1 and c2
         * either variable c1, c2 is -1 only if the corresponding string is finished
         */
        if(c1==c2) {
            if(c1<0) {
                return 0;   /* c1==c2==-1 indicating end of strings */
            }
            c1=c2=-1;       /* make us fetch new code units */
            continue;
        } else if(c1<0) {
            return -1;      /* string 1 ends before string 2 */
        } else if(c2<0) {
            return 1;       /* string 2 ends before string 1 */
        }
        /* c1!=c2 && c1>=0 && c2>=0 */

        /* get complete code points for c1, c2 for lookups if either is a surrogate */
        cp1=c1;
        if(UTF16.isSurrogate((char)c1)) {
            char c;

            if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
                if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
                    /* advance ++s1; only below if cp1 decomposes/case-folds */
                    cp1=Character.toCodePoint((char)c1, c);
                }
            } else /* isTrail(c1) */ {
                /* s1 is already past c1, so a preceding lead unit would sit at s1-2 */
                if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
                    cp1=Character.toCodePoint(c, (char)c1);
                }
            }
        }

        cp2=c2;
        if(UTF16.isSurrogate((char)c2)) {
            char c;

            if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
                if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
                    /* advance ++s2; only below if cp2 decomposes/case-folds */
                    cp2=Character.toCodePoint((char)c2, c);
                }
            } else /* isTrail(c2) */ {
                /* s2 is already past c2, so a preceding lead unit would sit at s2-2 */
                if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
                    cp2=Character.toCodePoint(c, (char)c2);
                }
            }
        }

        /*
         * go down one level for each string
         * continue with the main loop as soon as there is a real change
         */

        if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
            (length=csp.toFullFolding(cp1, fold1, options))>=0
        ) {
            /* cp1 case-folds to the code point "length" or to p[length] */
            if(UTF16.isSurrogate((char)c1)) {
                if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s1;
                } else /* isTrail(c1) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s2;
                    c2=cs2.charAt(s2-1);
                }
            }

            /* push current level pointers */
            if(stack1==null) {
                stack1=createCmpEquivLevelStack();
            }
            stack1[0].cs=cs1;
            stack1[0].s=s1;
            ++level1;

            /* copy the folding result to fold1[] */
            /* Java: the buffer was probably not empty, remove the old contents */
            if(length<=UCaseProps.MAX_STRING_LENGTH) {
                fold1.delete(0, fold1.length()-length);
            } else {
                fold1.setLength(0);
                fold1.appendCodePoint(length);
            }

            /* set next level pointers to case folding */
            cs1=fold1;
            s1=0;
            limit1=fold1.length();

            /* get ready to read from decomposition, continue with loop */
            c1=-1;
            continue;
        }

        if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
            (length=csp.toFullFolding(cp2, fold2, options))>=0
        ) {
            /* cp2 case-folds to the code point "length" or to p[length] */
            if(UTF16.isSurrogate((char)c2)) {
                if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s2;
                } else /* isTrail(c2) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s1;
                    c1=cs1.charAt(s1-1);
                }
            }

            /* push current level pointers */
            if(stack2==null) {
                stack2=createCmpEquivLevelStack();
            }
            stack2[0].cs=cs2;
            stack2[0].s=s2;
            ++level2;

            /* copy the folding result to fold2[] */
            /* Java: the buffer was probably not empty, remove the old contents */
            if(length<=UCaseProps.MAX_STRING_LENGTH) {
                fold2.delete(0, fold2.length()-length);
            } else {
                fold2.setLength(0);
                fold2.appendCodePoint(length);
            }

            /* set next level pointers to case folding */
            cs2=fold2;
            s2=0;
            limit2=fold2.length();

            /* get ready to read from decomposition, continue with loop */
            c2=-1;
            continue;
        }

        if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
            (decomp1=nfcImpl.getDecomposition(cp1))!=null
        ) {
            /* cp1 decomposes into p[length] */
            if(UTF16.isSurrogate((char)c1)) {
                if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
                    /* advance beyond source surrogate pair if it decomposes */
                    ++s1;
                } else /* isTrail(c1) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s2;
                    c2=cs2.charAt(s2-1);
                }
            }

            /* push current level pointers */
            if(stack1==null) {
                stack1=createCmpEquivLevelStack();
            }
            stack1[level1].cs=cs1;
            stack1[level1].s=s1;
            ++level1;

            /* set empty intermediate level if skipped */
            if(level1<2) {
                stack1[level1++].cs=null;
            }

            /* set next level pointers to decomposition */
            cs1=decomp1;
            s1=0;
            limit1=decomp1.length();

            /* get ready to read from decomposition, continue with loop */
            c1=-1;
            continue;
        }

        if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
            (decomp2=nfcImpl.getDecomposition(cp2))!=null
        ) {
            /* cp2 decomposes into p[length] */
            if(UTF16.isSurrogate((char)c2)) {
                if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
                    /* advance beyond source surrogate pair if it decomposes */
                    ++s2;
                } else /* isTrail(c2) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s1;
                    c1=cs1.charAt(s1-1);
                }
            }

            /* push current level pointers */
            if(stack2==null) {
                stack2=createCmpEquivLevelStack();
            }
            stack2[level2].cs=cs2;
            stack2[level2].s=s2;
            ++level2;

            /* set empty intermediate level if skipped */
            if(level2<2) {
                stack2[level2++].cs=null;
            }

            /* set next level pointers to decomposition */
            cs2=decomp2;
            s2=0;
            limit2=decomp2.length();

            /* get ready to read from decomposition, continue with loop */
            c2=-1;
            continue;
        }

        /*
         * no decomposition/case folding, max level for both sides:
         * return difference result
         *
         * code point order comparison must not just return cp1-cp2
         * because when single surrogates are present then the surrogate pairs
         * that formed cp1 and cp2 may be from different string indexes
         *
         * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
         * c1=d800 cp1=10001 c2=dc00 cp2=10000
         * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
         *
         * therefore, use same fix-up as in ustring.c/uprv_strCompare()
         * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
         * so we have slightly different pointer/start/limit comparisons here
         */

        if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
            /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
            if(
                (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
                (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c1-=0x2800;
            }

            if(
                (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
                (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c2-=0x2800;
            }
        }

        return c1-c2;
    }
}

/**
 * An Appendable that writes into a char array with a capacity that may be
 * less than array.length.
 * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
 * <p>
 * An overflow is only reported at the end, for the old Normalizer API functions that write
 * to char arrays.
*/
private static final class CharsAppendable implements Appendable {
    /**
     * @param dest      the array to write into
     * @param destStart index in dest at which the first char is written
     * @param destLimit index in dest that writing must not reach or pass
     */
    public CharsAppendable(char[] dest, int destStart, int destLimit) {
        chars=dest;
        start=offset=destStart;
        limit=destLimit;
    }

    /**
     * Reports how much was appended, and is the single place where an
     * overflow is surfaced.
     *
     * @return the number of chars appended since construction
     * @throws IndexOutOfBoundsException if more was appended than fits
     *         before the limit; the message is the length that would have
     *         been needed
     */
    public int length() {
        int len=offset-start;
        if(offset<=limit) {
            return len;
        } else {
            throw new IndexOutOfBoundsException(Integer.toString(len));
        }
    }

    /**
     * Stores c if there is room; the offset advances either way so that
     * {@link #length()} can detect the overflow later.
     */
    @Override
    public Appendable append(char c) {
        if(offset<limit) {
            chars[offset]=c;
        }
        ++offset;
        return this;
    }

    /** Appends all of s; equivalent to {@code append(s, 0, s.length())}. */
    @Override
    public Appendable append(CharSequence s) {
        return append(s, 0, s.length());
    }

    /**
     * Copies s[sStart..sLimit) if the whole range fits before the limit.
     * Otherwise nothing is copied and only the offset advances, which
     * {@link #length()} later reports as an overflow.
     */
    @Override
    public Appendable append(CharSequence s, int sStart, int sLimit) {
        int len=sLimit-sStart;
        if(len<=(limit-offset)) {
            while(sStart<sLimit) {  // TODO: Is there a better way to copy the characters?
                chars[offset++]=s.charAt(sStart++);
            }
        } else {
            offset+=len;
        }
        return this;
    }

    /** Destination array; only indices in [start, limit) are ever written. */
    private final char[] chars;
    /** Fixed bounds of the writable window inside chars. */
    private final int start, limit;
    /** Next write index; may run past limit to record an overflow. */
    private int offset;
}
}