1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /** 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.text; 11 12 import com.ibm.icu.impl.Utility; 13 14 /** 15 * <p> 16 * Standalone utility class providing UTF16 character conversions and indexing conversions. 17 * </p> 18 * <p> 19 * Code that uses strings alone rarely need modification. By design, UTF-16 does not allow overlap, 20 * so searching for strings is a safe operation. Similarly, concatenation is always safe. 21 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the 22 * values for start and end are on those boundaries, since they arose from operations like 23 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>. 24 * </p> 25 * <strong>Examples:</strong> 26 * <p> 27 * The following examples illustrate use of some of these methods. 
 *
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i < s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i >= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * int ch;
 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 *
 * <strong>Notes:</strong>
 * <ul>
 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
 * </li>
 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
 * check for validity if desired.
</li> 72 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then 73 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It 74 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4, 75 * 5.5). </li> 76 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the 77 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceeding small 78 * percentage of all the text in the world, the singleton case should always be optimized for. </li> 79 * </ul> 80 * 81 * @author Mark Davis, with help from Markus Scherer 82 * @stable ICU 2.1 83 */ 84 85 public final class UTF16 { 86 // public variables --------------------------------------------------- 87 88 /** 89 * Value returned in {@link #bounds(String, int) bounds()}. 90 * These values are chosen specifically so that it actually represents the position of the 91 * character [offset16 - (value >> 2), offset16 + (value & 3)] 92 * 93 * @stable ICU 2.1 94 */ 95 public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2, 96 TRAIL_SURROGATE_BOUNDARY = 5; 97 98 /** 99 * The lowest Unicode code point value. 100 * 101 * @stable ICU 2.1 102 */ 103 public static final int CODEPOINT_MIN_VALUE = 0; 104 105 /** 106 * The highest Unicode code point value (scalar value) according to the Unicode Standard. 
107 * 108 * @stable ICU 2.1 109 */ 110 public static final int CODEPOINT_MAX_VALUE = 0x10ffff; 111 112 /** 113 * The minimum value for Supplementary code points 114 * 115 * @stable ICU 2.1 116 */ 117 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 118 119 /** 120 * Lead surrogate minimum value 121 * 122 * @stable ICU 2.1 123 */ 124 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 125 126 /** 127 * Trail surrogate minimum value 128 * 129 * @stable ICU 2.1 130 */ 131 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 132 133 /** 134 * Lead surrogate maximum value 135 * 136 * @stable ICU 2.1 137 */ 138 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 139 140 /** 141 * Trail surrogate maximum value 142 * 143 * @stable ICU 2.1 144 */ 145 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 146 147 /** 148 * Surrogate minimum value 149 * 150 * @stable ICU 2.1 151 */ 152 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; 153 154 /** 155 * Maximum surrogate value 156 * 157 * @stable ICU 2.1 158 */ 159 public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE; 160 161 /** 162 * Lead surrogate bitmask 163 */ 164 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; 165 166 /** 167 * Trail surrogate bitmask 168 */ 169 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; 170 171 /** 172 * Surrogate bitmask 173 */ 174 private static final int SURROGATE_BITMASK = 0xFFFFF800; 175 176 /** 177 * Lead surrogate bits 178 */ 179 private static final int LEAD_SURROGATE_BITS = 0xD800; 180 181 /** 182 * Trail surrogate bits 183 */ 184 private static final int TRAIL_SURROGATE_BITS = 0xDC00; 185 186 /** 187 * Surrogate bits 188 */ 189 private static final int SURROGATE_BITS = 0xD800; 190 191 // constructor -------------------------------------------------------- 192 193 // /CLOVER:OFF 194 /** 195 * Prevent instance from being created. 
196 */ UTF16()197 private UTF16() { 198 } 199 200 // /CLOVER:ON 201 // public method ------------------------------------------------------ 202 203 /** 204 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 205 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 206 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 207 * UCharacter.isLegal()</a></code> 208 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 209 * character will be returned. If a complete supplementary character is not found the incomplete 210 * character will be returned 211 * 212 * @param source Array of UTF-16 chars 213 * @param offset16 UTF-16 offset to the start of the character. 214 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 215 * of that codepoint are the same as in <code>bounds32()</code>. 216 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 217 * @stable ICU 2.1 218 */ charAt(String source, int offset16)219 public static int charAt(String source, int offset16) { 220 char single = source.charAt(offset16); 221 if (single < LEAD_SURROGATE_MIN_VALUE) { 222 return single; 223 } 224 return _charAt(source, offset16, single); 225 } 226 _charAt(String source, int offset16, char single)227 private static int _charAt(String source, int offset16, char single) { 228 if (single > TRAIL_SURROGATE_MAX_VALUE) { 229 return single; 230 } 231 232 // Convert the UTF-16 surrogate pair if necessary. 233 // For simplicity in usage, and because the frequency of pairs is 234 // low, look both directions. 
235 236 if (single <= LEAD_SURROGATE_MAX_VALUE) { 237 ++offset16; 238 if (source.length() != offset16) { 239 char trail = source.charAt(offset16); 240 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { 241 return Character.toCodePoint(single, trail); 242 } 243 } 244 } else { 245 --offset16; 246 if (offset16 >= 0) { 247 // single is a trail surrogate so 248 char lead = source.charAt(offset16); 249 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { 250 return Character.toCodePoint(lead, single); 251 } 252 } 253 } 254 return single; // return unmatched surrogate 255 } 256 257 /** 258 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 259 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 260 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 261 * UCharacter.isLegal()</a></code> 262 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 263 * character will be returned. If a complete supplementary character is not found the incomplete 264 * character will be returned 265 * 266 * @param source Array of UTF-16 chars 267 * @param offset16 UTF-16 offset to the start of the character. 268 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 269 * of that codepoint are the same as in <code>bounds32()</code>. 270 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 
271 * @stable ICU 2.1 272 */ charAt(CharSequence source, int offset16)273 public static int charAt(CharSequence source, int offset16) { 274 char single = source.charAt(offset16); 275 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { 276 return single; 277 } 278 return _charAt(source, offset16, single); 279 } 280 _charAt(CharSequence source, int offset16, char single)281 private static int _charAt(CharSequence source, int offset16, char single) { 282 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { 283 return single; 284 } 285 286 // Convert the UTF-16 surrogate pair if necessary. 287 // For simplicity in usage, and because the frequency of pairs is 288 // low, look both directions. 289 290 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 291 ++offset16; 292 if (source.length() != offset16) { 293 char trail = source.charAt(offset16); 294 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE 295 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { 296 return Character.toCodePoint(single, trail); 297 } 298 } 299 } else { 300 --offset16; 301 if (offset16 >= 0) { 302 // single is a trail surrogate so 303 char lead = source.charAt(offset16); 304 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE 305 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 306 return Character.toCodePoint(lead, single); 307 } 308 } 309 } 310 return single; // return unmatched surrogate 311 } 312 313 /** 314 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 315 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 316 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 317 * </a></code> 318 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 319 * character will be returned. 
If a complete supplementary character is not found the incomplete 320 * character will be returned 321 * 322 * @param source UTF-16 chars string buffer 323 * @param offset16 UTF-16 offset to the start of the character. 324 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 325 * of that codepoint are the same as in <code>bounds32()</code>. 326 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 327 * @stable ICU 2.1 328 */ charAt(StringBuffer source, int offset16)329 public static int charAt(StringBuffer source, int offset16) { 330 if (offset16 < 0 || offset16 >= source.length()) { 331 throw new StringIndexOutOfBoundsException(offset16); 332 } 333 334 char single = source.charAt(offset16); 335 if (!isSurrogate(single)) { 336 return single; 337 } 338 339 // Convert the UTF-16 surrogate pair if necessary. 340 // For simplicity in usage, and because the frequency of pairs is 341 // low, look both directions. 342 343 if (single <= LEAD_SURROGATE_MAX_VALUE) { 344 ++offset16; 345 if (source.length() != offset16) { 346 char trail = source.charAt(offset16); 347 if (isTrailSurrogate(trail)) 348 return Character.toCodePoint(single, trail); 349 } 350 } else { 351 --offset16; 352 if (offset16 >= 0) { 353 // single is a trail surrogate so 354 char lead = source.charAt(offset16); 355 if (isLeadSurrogate(lead)) { 356 return Character.toCodePoint(lead, single); 357 } 358 } 359 } 360 return single; // return unmatched surrogate 361 } 362 363 /** 364 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards 365 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 366 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 367 * </a></code> 368 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 369 * character will be returned. 
If a complete supplementary character is not found the incomplete 370 * character will be returned 371 * 372 * @param source Array of UTF-16 chars 373 * @param start Offset to substring in the source array for analyzing 374 * @param limit Offset to substring in the source array for analyzing 375 * @param offset16 UTF-16 offset relative to start 376 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 377 * of that codepoint are the same as in <code>bounds32()</code>. 378 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. 379 * @stable ICU 2.1 380 */ charAt(char source[], int start, int limit, int offset16)381 public static int charAt(char source[], int start, int limit, int offset16) { 382 offset16 += start; 383 if (offset16 < start || offset16 >= limit) { 384 throw new ArrayIndexOutOfBoundsException(offset16); 385 } 386 387 char single = source[offset16]; 388 if (!isSurrogate(single)) { 389 return single; 390 } 391 392 // Convert the UTF-16 surrogate pair if necessary. 393 // For simplicity in usage, and because the frequency of pairs is 394 // low, look both directions. 395 if (single <= LEAD_SURROGATE_MAX_VALUE) { 396 offset16++; 397 if (offset16 >= limit) { 398 return single; 399 } 400 char trail = source[offset16]; 401 if (isTrailSurrogate(trail)) { 402 return Character.toCodePoint(single, trail); 403 } 404 } else { // isTrailSurrogate(single), so 405 if (offset16 == start) { 406 return single; 407 } 408 offset16--; 409 char lead = source[offset16]; 410 if (isLeadSurrogate(lead)) 411 return Character.toCodePoint(lead, single); 412 } 413 return single; // return unmatched surrogate 414 } 415 416 /** 417 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 418 * <code>UTF16.getCharCount()</code>, as well as random access. 
If a validity check is 419 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 420 * </a></code> 421 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 422 * character will be returned. If a complete supplementary character is not found the incomplete 423 * character will be returned 424 * 425 * @param source UTF-16 chars string buffer 426 * @param offset16 UTF-16 offset to the start of the character. 427 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 428 * of that codepoint are the same as in <code>bounds32()</code>. 429 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 430 * @stable ICU 2.1 431 */ charAt(Replaceable source, int offset16)432 public static int charAt(Replaceable source, int offset16) { 433 if (offset16 < 0 || offset16 >= source.length()) { 434 throw new StringIndexOutOfBoundsException(offset16); 435 } 436 437 char single = source.charAt(offset16); 438 if (!isSurrogate(single)) { 439 return single; 440 } 441 442 // Convert the UTF-16 surrogate pair if necessary. 443 // For simplicity in usage, and because the frequency of pairs is 444 // low, look both directions. 445 446 if (single <= LEAD_SURROGATE_MAX_VALUE) { 447 ++offset16; 448 if (source.length() != offset16) { 449 char trail = source.charAt(offset16); 450 if (isTrailSurrogate(trail)) 451 return Character.toCodePoint(single, trail); 452 } 453 } else { 454 --offset16; 455 if (offset16 >= 0) { 456 // single is a trail surrogate so 457 char lead = source.charAt(offset16); 458 if (isLeadSurrogate(lead)) { 459 return Character.toCodePoint(lead, single); 460 } 461 } 462 } 463 return single; // return unmatched surrogate 464 } 465 466 /** 467 * Determines how many chars this char32 requires. If a validity check is required, use <code> 468 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 469 * on char32 before calling. 
470 * 471 * @param char32 The input codepoint. 472 * @return 2 if is in supplementary space, otherwise 1. 473 * @stable ICU 2.1 474 */ getCharCount(int char32)475 public static int getCharCount(int char32) { 476 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 477 return 1; 478 } 479 return 2; 480 } 481 482 /** 483 * Returns the type of the boundaries around the char at offset16. Used for random access. 484 * 485 * @param source Text to analyse 486 * @param offset16 UTF-16 offset 487 * @return 488 * <ul> 489 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1] 490 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 491 * are [offset16, offset16 + 2] 492 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 493 * bounds are [offset16 - 1, offset16 + 1] 494 * </ul> 495 * For bit-twiddlers, the return values for these are chosen so that the boundaries 496 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 497 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 498 * @stable ICU 2.1 499 */ bounds(String source, int offset16)500 public static int bounds(String source, int offset16) { 501 char ch = source.charAt(offset16); 502 if (isSurrogate(ch)) { 503 if (isLeadSurrogate(ch)) { 504 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 505 return LEAD_SURROGATE_BOUNDARY; 506 } 507 } else { 508 // isTrailSurrogate(ch), so 509 --offset16; 510 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 511 return TRAIL_SURROGATE_BOUNDARY; 512 } 513 } 514 } 515 return SINGLE_CHAR_BOUNDARY; 516 } 517 518 /** 519 * Returns the type of the boundaries around the char at offset16. Used for random access. 
520 * 521 * @param source String buffer to analyse 522 * @param offset16 UTF16 offset 523 * @return 524 * <ul> 525 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1] 526 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 527 * are [offset16, offset16 + 2] 528 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 529 * bounds are [offset16 - 1, offset16 + 1] 530 * </ul> 531 * For bit-twiddlers, the return values for these are chosen so that the boundaries 532 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 533 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 534 * @stable ICU 2.1 535 */ bounds(StringBuffer source, int offset16)536 public static int bounds(StringBuffer source, int offset16) { 537 char ch = source.charAt(offset16); 538 if (isSurrogate(ch)) { 539 if (isLeadSurrogate(ch)) { 540 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 541 return LEAD_SURROGATE_BOUNDARY; 542 } 543 } else { 544 // isTrailSurrogate(ch), so 545 --offset16; 546 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 547 return TRAIL_SURROGATE_BOUNDARY; 548 } 549 } 550 } 551 return SINGLE_CHAR_BOUNDARY; 552 } 553 554 /** 555 * Returns the type of the boundaries around the char at offset16. Used for random access. Note 556 * that the boundaries are determined with respect to the subarray, hence the char array 557 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1. 
558 * 559 * @param source Char array to analyse 560 * @param start Offset to substring in the source array for analyzing 561 * @param limit Offset to substring in the source array for analyzing 562 * @param offset16 UTF16 offset relative to start 563 * @return 564 * <ul> 565 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are 566 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 567 * are [offset16, offset16 + 2] 568 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 569 * bounds are [offset16 - 1, offset16 + 1] 570 * </ul> 571 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries 572 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)]. 573 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 574 * @stable ICU 2.1 575 */ bounds(char source[], int start, int limit, int offset16)576 public static int bounds(char source[], int start, int limit, int offset16) { 577 offset16 += start; 578 if (offset16 < start || offset16 >= limit) { 579 throw new ArrayIndexOutOfBoundsException(offset16); 580 } 581 char ch = source[offset16]; 582 if (isSurrogate(ch)) { 583 if (isLeadSurrogate(ch)) { 584 ++offset16; 585 if (offset16 < limit && isTrailSurrogate(source[offset16])) { 586 return LEAD_SURROGATE_BOUNDARY; 587 } 588 } else { // isTrailSurrogate(ch), so 589 --offset16; 590 if (offset16 >= start && isLeadSurrogate(source[offset16])) { 591 return TRAIL_SURROGATE_BOUNDARY; 592 } 593 } 594 } 595 return SINGLE_CHAR_BOUNDARY; 596 } 597 598 /** 599 * Determines whether the code value is a surrogate. 600 * 601 * @param char16 The input character. 602 * @return true If the input character is a surrogate. 
603 * @stable ICU 2.1 604 */ isSurrogate(char char16)605 public static boolean isSurrogate(char char16) { 606 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; 607 } 608 609 /** 610 * Determines whether the character is a trail surrogate. 611 * 612 * @param char16 The input character. 613 * @return true If the input character is a trail surrogate. 614 * @stable ICU 2.1 615 */ isTrailSurrogate(char char16)616 public static boolean isTrailSurrogate(char char16) { 617 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; 618 } 619 620 /** 621 * Determines whether the character is a lead surrogate. 622 * 623 * @param char16 The input character. 624 * @return true If the input character is a lead surrogate 625 * @stable ICU 2.1 626 */ isLeadSurrogate(char char16)627 public static boolean isLeadSurrogate(char char16) { 628 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; 629 } 630 631 /** 632 * Returns the lead surrogate. If a validity check is required, use 633 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 634 * before calling. 635 * 636 * @param char32 The input character. 637 * @return lead surrogate if the getCharCount(ch) is 2; <br> 638 * and 0 otherwise (note: 0 is not a valid lead surrogate). 639 * @stable ICU 2.1 640 */ getLeadSurrogate(int char32)641 public static char getLeadSurrogate(int char32) { 642 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 643 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_)); 644 } 645 return 0; 646 } 647 648 /** 649 * Returns the trail surrogate. If a validity check is required, use 650 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 651 * before calling. 652 * 653 * @param char32 The input character. 
654 * @return the trail surrogate if the getCharCount(ch) is 2; <br> 655 * otherwise the character itself 656 * @stable ICU 2.1 657 */ getTrailSurrogate(int char32)658 public static char getTrailSurrogate(int char32) { 659 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 660 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_)); 661 } 662 return (char) char32; 663 } 664 665 /** 666 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string 667 * containing the UTF-32 value in UTF16 format. If a validity check is required, use 668 * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before calling. 669 * 670 * @param char32 The input character. 671 * @return string value of char32 in UTF16 format 672 * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint. 673 * @stable ICU 2.1 674 */ valueOf(int char32)675 public static String valueOf(int char32) { 676 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 677 throw new IllegalArgumentException("Illegal codepoint"); 678 } 679 return toString(char32); 680 } 681 682 /** 683 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or 684 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate 685 * character, the whole supplementary codepoint will be returned. If a validity check is 686 * required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the 687 * codepoint at offset16 before calling. The result returned will be a newly created String 688 * obtained by calling source.substring(..) with the appropriate indexes. 689 * 690 * @param source The input string. 
691 * @param offset16 The UTF16 index to the codepoint in source 692 * @return string value of char32 in UTF16 format 693 * @stable ICU 2.1 694 */ valueOf(String source, int offset16)695 public static String valueOf(String source, int offset16) { 696 switch (bounds(source, offset16)) { 697 case LEAD_SURROGATE_BOUNDARY: 698 return source.substring(offset16, offset16 + 2); 699 case TRAIL_SURROGATE_BOUNDARY: 700 return source.substring(offset16 - 1, offset16 + 1); 701 default: 702 return source.substring(offset16, offset16 + 1); 703 } 704 } 705 706 /** 707 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a 708 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a 709 * surrogate character, the whole supplementary codepoint will be returned. If a validity check 710 * is required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on 711 * the codepoint at offset16 before calling. The result returned will be a newly created String 712 * obtained by calling source.substring(..) with the appropriate indexes. 713 * 714 * @param source The input string buffer. 715 * @param offset16 The UTF16 index to the codepoint in source 716 * @return string value of char32 in UTF16 format 717 * @stable ICU 2.1 718 */ valueOf(StringBuffer source, int offset16)719 public static String valueOf(StringBuffer source, int offset16) { 720 switch (bounds(source, offset16)) { 721 case LEAD_SURROGATE_BOUNDARY: 722 return source.substring(offset16, offset16 + 2); 723 case TRAIL_SURROGATE_BOUNDARY: 724 return source.substring(offset16 - 1, offset16 + 1); 725 default: 726 return source.substring(offset16, offset16 + 1); 727 } 728 } 729 730 /** 731 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16 732 * format. 
If offset16 indexes a surrogate character, the whole supplementary codepoint will be 733 * returned, except when either the leading or trailing surrogate character lies out of the 734 * specified subarray. In the latter case, only the surrogate character within bounds will be 735 * returned. If a validity check is required, use 736 * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the codepoint at 737 * offset16 before calling. The result returned will be a newly created String containing the 738 * relevant characters. 739 * 740 * @param source The input char array. 741 * @param start Start index of the subarray 742 * @param limit End index of the subarray 743 * @param offset16 The UTF16 index to the codepoint in source relative to start 744 * @return string value of char32 in UTF16 format 745 * @stable ICU 2.1 746 */ valueOf(char source[], int start, int limit, int offset16)747 public static String valueOf(char source[], int start, int limit, int offset16) { 748 switch (bounds(source, start, limit, offset16)) { 749 case LEAD_SURROGATE_BOUNDARY: 750 return new String(source, start + offset16, 2); 751 case TRAIL_SURROGATE_BOUNDARY: 752 return new String(source, start + offset16 - 1, 2); 753 } 754 return new String(source, start + offset16, 1); 755 } 756 757 /** 758 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 759 * the {@link UTF16 class description} for notes on roundtripping. 760 * 761 * @param source The UTF-16 string 762 * @param offset32 UTF-32 offset 763 * @return UTF-16 offset 764 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 
765 * @stable ICU 2.1 766 */ findOffsetFromCodePoint(String source, int offset32)767 public static int findOffsetFromCodePoint(String source, int offset32) { 768 char ch; 769 int size = source.length(), result = 0, count = offset32; 770 if (offset32 < 0 || offset32 > size) { 771 throw new StringIndexOutOfBoundsException(offset32); 772 } 773 while (result < size && count > 0) { 774 ch = source.charAt(result); 775 if (isLeadSurrogate(ch) && ((result + 1) < size) 776 && isTrailSurrogate(source.charAt(result + 1))) { 777 result++; 778 } 779 780 count--; 781 result++; 782 } 783 if (count != 0) { 784 throw new StringIndexOutOfBoundsException(offset32); 785 } 786 return result; 787 } 788 789 /** 790 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 791 * the {@link UTF16 class description} for notes on roundtripping. 792 * 793 * @param source The UTF-16 string buffer 794 * @param offset32 UTF-32 offset 795 * @return UTF-16 offset 796 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 797 * @stable ICU 2.1 798 */ findOffsetFromCodePoint(StringBuffer source, int offset32)799 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) { 800 char ch; 801 int size = source.length(), result = 0, count = offset32; 802 if (offset32 < 0 || offset32 > size) { 803 throw new StringIndexOutOfBoundsException(offset32); 804 } 805 while (result < size && count > 0) { 806 ch = source.charAt(result); 807 if (isLeadSurrogate(ch) && ((result + 1) < size) 808 && isTrailSurrogate(source.charAt(result + 1))) { 809 result++; 810 } 811 812 count--; 813 result++; 814 } 815 if (count != 0) { 816 throw new StringIndexOutOfBoundsException(offset32); 817 } 818 return result; 819 } 820 821 /** 822 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 823 * the {@link UTF16 class description} for notes on roundtripping. 
824 * 825 * @param source The UTF-16 char array whose substring is to be analysed 826 * @param start Offset of the substring to be analysed 827 * @param limit Offset of the substring to be analysed 828 * @param offset32 UTF-32 offset relative to start 829 * @return UTF-16 offset relative to start 830 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 831 * @stable ICU 2.1 832 */ findOffsetFromCodePoint(char source[], int start, int limit, int offset32)833 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) { 834 char ch; 835 int result = start, count = offset32; 836 if (offset32 > limit - start) { 837 throw new ArrayIndexOutOfBoundsException(offset32); 838 } 839 while (result < limit && count > 0) { 840 ch = source[result]; 841 if (isLeadSurrogate(ch) && ((result + 1) < limit) 842 && isTrailSurrogate(source[result + 1])) { 843 result++; 844 } 845 846 count--; 847 result++; 848 } 849 if (count != 0) { 850 throw new ArrayIndexOutOfBoundsException(offset32); 851 } 852 return result - start; 853 } 854 855 /** 856 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given 857 * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for 858 * notes on roundtripping.<br> 859 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 860 * of the <strong>lead</strong> of the pair is returned. </i> 861 * <p> 862 * To find the UTF-32 length of a string, use: 863 * 864 * <pre> 865 * len32 = countCodePoint(source, source.length()); 866 * </pre> 867 * 868 * @param source Text to analyse 869 * @param offset16 UTF-16 offset < source text length. 870 * @return UTF-32 offset 871 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
872 * @stable ICU 2.1 873 */ findCodePointOffset(String source, int offset16)874 public static int findCodePointOffset(String source, int offset16) { 875 if (offset16 < 0 || offset16 > source.length()) { 876 throw new StringIndexOutOfBoundsException(offset16); 877 } 878 879 int result = 0; 880 char ch; 881 boolean hadLeadSurrogate = false; 882 883 for (int i = 0; i < offset16; ++i) { 884 ch = source.charAt(i); 885 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 886 hadLeadSurrogate = false; // count valid trail as zero 887 } else { 888 hadLeadSurrogate = isLeadSurrogate(ch); 889 ++result; // count others as 1 890 } 891 } 892 893 if (offset16 == source.length()) { 894 return result; 895 } 896 897 // end of source being the less significant surrogate character 898 // shift result back to the start of the supplementary character 899 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 900 result--; 901 } 902 903 return result; 904 } 905 906 /** 907 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 908 * offset. Used for random access. See the {@link UTF16 class description} for notes on 909 * roundtripping.<br> 910 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 911 * of the <strong>lead</strong> of the pair is returned. </i> 912 * <p> 913 * To find the UTF-32 length of a string, use: 914 * 915 * <pre> 916 * len32 = countCodePoint(source); 917 * </pre> 918 * 919 * @param source Text to analyse 920 * @param offset16 UTF-16 offset < source text length. 921 * @return UTF-32 offset 922 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
923 * @stable ICU 2.1 924 */ findCodePointOffset(StringBuffer source, int offset16)925 public static int findCodePointOffset(StringBuffer source, int offset16) { 926 if (offset16 < 0 || offset16 > source.length()) { 927 throw new StringIndexOutOfBoundsException(offset16); 928 } 929 930 int result = 0; 931 char ch; 932 boolean hadLeadSurrogate = false; 933 934 for (int i = 0; i < offset16; ++i) { 935 ch = source.charAt(i); 936 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 937 hadLeadSurrogate = false; // count valid trail as zero 938 } else { 939 hadLeadSurrogate = isLeadSurrogate(ch); 940 ++result; // count others as 1 941 } 942 } 943 944 if (offset16 == source.length()) { 945 return result; 946 } 947 948 // end of source being the less significant surrogate character 949 // shift result back to the start of the supplementary character 950 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 951 result--; 952 } 953 954 return result; 955 } 956 957 /** 958 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 959 * offset. Used for random access. See the {@link UTF16 class description} for notes on 960 * roundtripping.<br> 961 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 962 * of the <strong>lead</strong> of the pair is returned. </i> 963 * <p> 964 * To find the UTF-32 length of a substring, use: 965 * 966 * <pre> 967 * len32 = countCodePoint(source, start, limit); 968 * </pre> 969 * 970 * @param source Text to analyse 971 * @param start Offset of the substring 972 * @param limit Offset of the substring 973 * @param offset16 UTF-16 relative to start 974 * @return UTF-32 offset relative to start 975 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 
976 * @stable ICU 2.1 977 */ findCodePointOffset(char source[], int start, int limit, int offset16)978 public static int findCodePointOffset(char source[], int start, int limit, int offset16) { 979 offset16 += start; 980 if (offset16 > limit) { 981 throw new StringIndexOutOfBoundsException(offset16); 982 } 983 984 int result = 0; 985 char ch; 986 boolean hadLeadSurrogate = false; 987 988 for (int i = start; i < offset16; ++i) { 989 ch = source[i]; 990 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 991 hadLeadSurrogate = false; // count valid trail as zero 992 } else { 993 hadLeadSurrogate = isLeadSurrogate(ch); 994 ++result; // count others as 1 995 } 996 } 997 998 if (offset16 == limit) { 999 return result; 1000 } 1001 1002 // end of source being the less significant surrogate character 1003 // shift result back to the start of the supplementary character 1004 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) { 1005 result--; 1006 } 1007 1008 return result; 1009 } 1010 1011 /** 1012 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required, 1013 * use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before 1014 * calling. 1015 * 1016 * @param target The buffer to append to 1017 * @param char32 Value to append. 
1018 * @return the updated StringBuffer 1019 * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints 1020 * @stable ICU 2.1 1021 */ append(StringBuffer target, int char32)1022 public static StringBuffer append(StringBuffer target, int char32) { 1023 // Check for irregular values 1024 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1025 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); 1026 } 1027 1028 // Write the UTF-16 values 1029 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1030 target.append(getLeadSurrogate(char32)); 1031 target.append(getTrailSurrogate(char32)); 1032 } else { 1033 target.append((char) char32); 1034 } 1035 return target; 1036 } 1037 1038 /** 1039 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a 1040 * convenience. 1041 * 1042 * @param target The buffer to append to 1043 * @param cp The code point to append 1044 * @return the updated StringBuffer 1045 * @throws IllegalArgumentException If cp is not a valid code point 1046 * @stable ICU 3.0 1047 */ appendCodePoint(StringBuffer target, int cp)1048 public static StringBuffer appendCodePoint(StringBuffer target, int cp) { 1049 return append(target, cp); 1050 } 1051 1052 /** 1053 * Adds a codepoint to offset16 position of the argument char array. 1054 * 1055 * @param target Char array to be append with the new code point 1056 * @param limit UTF16 offset which the codepoint will be appended. 1057 * @param char32 Code point to be appended 1058 * @return offset after char32 in the array. 1059 * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not 1060 * lie within the range of the Unicode codepoints. 
1061 * @stable ICU 2.1 1062 */ append(char[] target, int limit, int char32)1063 public static int append(char[] target, int limit, int char32) { 1064 // Check for irregular values 1065 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1066 throw new IllegalArgumentException("Illegal codepoint"); 1067 } 1068 // Write the UTF-16 values 1069 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1070 target[limit++] = getLeadSurrogate(char32); 1071 target[limit++] = getTrailSurrogate(char32); 1072 } else { 1073 target[limit++] = (char) char32; 1074 } 1075 return limit; 1076 } 1077 1078 /** 1079 * Number of codepoints in a UTF16 String 1080 * 1081 * @param source UTF16 string 1082 * @return number of codepoint in string 1083 * @stable ICU 2.1 1084 */ countCodePoint(String source)1085 public static int countCodePoint(String source) { 1086 if (source == null || source.length() == 0) { 1087 return 0; 1088 } 1089 return findCodePointOffset(source, source.length()); 1090 } 1091 1092 /** 1093 * Number of codepoints in a UTF16 String buffer 1094 * 1095 * @param source UTF16 string buffer 1096 * @return number of codepoint in string 1097 * @stable ICU 2.1 1098 */ countCodePoint(StringBuffer source)1099 public static int countCodePoint(StringBuffer source) { 1100 if (source == null || source.length() == 0) { 1101 return 0; 1102 } 1103 return findCodePointOffset(source, source.length()); 1104 } 1105 1106 /** 1107 * Number of codepoints in a UTF16 char array substring 1108 * 1109 * @param source UTF16 char array 1110 * @param start Offset of the substring 1111 * @param limit Offset of the substring 1112 * @return number of codepoint in the substring 1113 * @exception IndexOutOfBoundsException If start and limit are not valid. 
1114 * @stable ICU 2.1 1115 */ countCodePoint(char source[], int start, int limit)1116 public static int countCodePoint(char source[], int start, int limit) { 1117 if (source == null || source.length == 0) { 1118 return 0; 1119 } 1120 return findCodePointOffset(source, start, limit, limit - start); 1121 } 1122 1123 /** 1124 * Set a code point into a UTF16 position. Adjusts target according if we are replacing a 1125 * non-supplementary codepoint with a supplementary and vice versa. 1126 * 1127 * @param target Stringbuffer 1128 * @param offset16 UTF16 position to insert into 1129 * @param char32 Code point 1130 * @stable ICU 2.1 1131 */ setCharAt(StringBuffer target, int offset16, int char32)1132 public static void setCharAt(StringBuffer target, int offset16, int char32) { 1133 int count = 1; 1134 char single = target.charAt(offset16); 1135 1136 if (isSurrogate(single)) { 1137 // pairs of the surrogate with offset16 at the lead char found 1138 if (isLeadSurrogate(single) && (target.length() > offset16 + 1) 1139 && isTrailSurrogate(target.charAt(offset16 + 1))) { 1140 count++; 1141 } else { 1142 // pairs of the surrogate with offset16 at the trail char 1143 // found 1144 if (isTrailSurrogate(single) && (offset16 > 0) 1145 && isLeadSurrogate(target.charAt(offset16 - 1))) { 1146 offset16--; 1147 count++; 1148 } 1149 } 1150 } 1151 target.replace(offset16, offset16 + count, valueOf(char32)); 1152 } 1153 1154 /** 1155 * Set a code point into a UTF16 position in a char array. Adjusts target according if we are 1156 * replacing a non-supplementary codepoint with a supplementary and vice versa. 1157 * 1158 * @param target char array 1159 * @param limit numbers of valid chars in target, different from target.length. limit counts the 1160 * number of chars in target that represents a string, not the size of array target. 
1161 * @param offset16 UTF16 position to insert into 1162 * @param char32 code point 1163 * @return new number of chars in target that represents a string 1164 * @exception IndexOutOfBoundsException if offset16 is out of range 1165 * @stable ICU 2.1 1166 */ setCharAt(char target[], int limit, int offset16, int char32)1167 public static int setCharAt(char target[], int limit, int offset16, int char32) { 1168 if (offset16 >= limit) { 1169 throw new ArrayIndexOutOfBoundsException(offset16); 1170 } 1171 int count = 1; 1172 char single = target[offset16]; 1173 1174 if (isSurrogate(single)) { 1175 // pairs of the surrogate with offset16 at the lead char found 1176 if (isLeadSurrogate(single) && (target.length > offset16 + 1) 1177 && isTrailSurrogate(target[offset16 + 1])) { 1178 count++; 1179 } else { 1180 // pairs of the surrogate with offset16 at the trail char 1181 // found 1182 if (isTrailSurrogate(single) && (offset16 > 0) 1183 && isLeadSurrogate(target[offset16 - 1])) { 1184 offset16--; 1185 count++; 1186 } 1187 } 1188 } 1189 1190 String str = valueOf(char32); 1191 int result = limit; 1192 int strlength = str.length(); 1193 target[offset16] = str.charAt(0); 1194 if (count == strlength) { 1195 if (count == 2) { 1196 target[offset16 + 1] = str.charAt(1); 1197 } 1198 } else { 1199 // this is not exact match in space, we'll have to do some 1200 // shifting 1201 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit 1202 - (offset16 + count)); 1203 if (count < strlength) { 1204 // char32 is a supplementary character trying to squeeze into 1205 // a non-supplementary space 1206 target[offset16 + 1] = str.charAt(1); 1207 result++; 1208 if (result < target.length) { 1209 target[result] = 0; 1210 } 1211 } else { 1212 // char32 is a non-supplementary character trying to fill 1213 // into a supplementary space 1214 result--; 1215 target[result] = 0; 1216 } 1217 } 1218 return result; 1219 } 1220 1221 /** 1222 * Shifts offset16 by the argument number of 
codepoints 1223 * 1224 * @param source string 1225 * @param offset16 UTF16 position to shift 1226 * @param shift32 number of codepoints to shift 1227 * @return new shifted offset16 1228 * @exception IndexOutOfBoundsException if the new offset16 is out of bounds. 1229 * @stable ICU 2.1 1230 */ moveCodePointOffset(String source, int offset16, int shift32)1231 public static int moveCodePointOffset(String source, int offset16, int shift32) { 1232 int result = offset16; 1233 int size = source.length(); 1234 int count; 1235 char ch; 1236 if (offset16 < 0 || offset16 > size) { 1237 throw new StringIndexOutOfBoundsException(offset16); 1238 } 1239 if (shift32 > 0) { 1240 if (shift32 + offset16 > size) { 1241 throw new StringIndexOutOfBoundsException(offset16); 1242 } 1243 count = shift32; 1244 while (result < size && count > 0) { 1245 ch = source.charAt(result); 1246 if (isLeadSurrogate(ch) && ((result + 1) < size) 1247 && isTrailSurrogate(source.charAt(result + 1))) { 1248 result++; 1249 } 1250 count--; 1251 result++; 1252 } 1253 } else { 1254 if (offset16 + shift32 < 0) { 1255 throw new StringIndexOutOfBoundsException(offset16); 1256 } 1257 for (count = -shift32; count > 0; count--) { 1258 result--; 1259 if (result < 0) { 1260 break; 1261 } 1262 ch = source.charAt(result); 1263 if (isTrailSurrogate(ch) && result > 0 1264 && isLeadSurrogate(source.charAt(result - 1))) { 1265 result--; 1266 } 1267 } 1268 } 1269 if (count != 0) { 1270 throw new StringIndexOutOfBoundsException(shift32); 1271 } 1272 return result; 1273 } 1274 1275 /** 1276 * Shifts offset16 by the argument number of codepoints 1277 * 1278 * @param source String buffer 1279 * @param offset16 UTF16 position to shift 1280 * @param shift32 Number of codepoints to shift 1281 * @return new shifted offset16 1282 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds. 
1283 * @stable ICU 2.1 1284 */ moveCodePointOffset(StringBuffer source, int offset16, int shift32)1285 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) { 1286 int result = offset16; 1287 int size = source.length(); 1288 int count; 1289 char ch; 1290 if (offset16 < 0 || offset16 > size) { 1291 throw new StringIndexOutOfBoundsException(offset16); 1292 } 1293 if (shift32 > 0) { 1294 if (shift32 + offset16 > size) { 1295 throw new StringIndexOutOfBoundsException(offset16); 1296 } 1297 count = shift32; 1298 while (result < size && count > 0) { 1299 ch = source.charAt(result); 1300 if (isLeadSurrogate(ch) && ((result + 1) < size) 1301 && isTrailSurrogate(source.charAt(result + 1))) { 1302 result++; 1303 } 1304 count--; 1305 result++; 1306 } 1307 } else { 1308 if (offset16 + shift32 < 0) { 1309 throw new StringIndexOutOfBoundsException(offset16); 1310 } 1311 for (count = -shift32; count > 0; count--) { 1312 result--; 1313 if (result < 0) { 1314 break; 1315 } 1316 ch = source.charAt(result); 1317 if (isTrailSurrogate(ch) && result > 0 1318 && isLeadSurrogate(source.charAt(result - 1))) { 1319 result--; 1320 } 1321 } 1322 } 1323 if (count != 0) { 1324 throw new StringIndexOutOfBoundsException(shift32); 1325 } 1326 return result; 1327 } 1328 1329 /** 1330 * Shifts offset16 by the argument number of codepoints within a subarray. 1331 * 1332 * @param source Char array 1333 * @param start Position of the subarray to be performed on 1334 * @param limit Position of the subarray to be performed on 1335 * @param offset16 UTF16 position to shift relative to start 1336 * @param shift32 Number of codepoints to shift 1337 * @return new shifted offset16 relative to start 1338 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the 1339 * subarray bounds are out of range. 
1340 * @stable ICU 2.1 1341 */ moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32)1342 public static int moveCodePointOffset(char source[], int start, int limit, int offset16, 1343 int shift32) { 1344 int size = source.length; 1345 int count; 1346 char ch; 1347 int result = offset16 + start; 1348 if (start < 0 || limit < start) { 1349 throw new StringIndexOutOfBoundsException(start); 1350 } 1351 if (limit > size) { 1352 throw new StringIndexOutOfBoundsException(limit); 1353 } 1354 if (offset16 < 0 || result > limit) { 1355 throw new StringIndexOutOfBoundsException(offset16); 1356 } 1357 if (shift32 > 0) { 1358 if (shift32 + result > size) { 1359 throw new StringIndexOutOfBoundsException(result); 1360 } 1361 count = shift32; 1362 while (result < limit && count > 0) { 1363 ch = source[result]; 1364 if (isLeadSurrogate(ch) && (result + 1 < limit) 1365 && isTrailSurrogate(source[result + 1])) { 1366 result++; 1367 } 1368 count--; 1369 result++; 1370 } 1371 } else { 1372 if (result + shift32 < start) { 1373 throw new StringIndexOutOfBoundsException(result); 1374 } 1375 for (count = -shift32; count > 0; count--) { 1376 result--; 1377 if (result < start) { 1378 break; 1379 } 1380 ch = source[result]; 1381 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { 1382 result--; 1383 } 1384 } 1385 } 1386 if (count != 0) { 1387 throw new StringIndexOutOfBoundsException(shift32); 1388 } 1389 result -= start; 1390 return result; 1391 } 1392 1393 /** 1394 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1395 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1396 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2 1397 * otherwise. 
1398 * <p> 1399 * The overall effect is exactly as if the argument were converted to a string by the method 1400 * valueOf(char) and the characters in that string were then inserted into target at the 1401 * position indicated by offset16. 1402 * </p> 1403 * <p> 1404 * The offset argument must be greater than or equal to 0, and less than or equal to the length 1405 * of source. 1406 * 1407 * @param target String buffer to insert to 1408 * @param offset16 Offset which char32 will be inserted in 1409 * @param char32 Codepoint to be inserted 1410 * @return a reference to target 1411 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1412 * @stable ICU 2.1 1413 */ insert(StringBuffer target, int offset16, int char32)1414 public static StringBuffer insert(StringBuffer target, int offset16, int char32) { 1415 String str = valueOf(char32); 1416 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1417 offset16++; 1418 } 1419 target.insert(offset16, str); 1420 return target; 1421 } 1422 1423 /** 1424 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1425 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1426 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise. 1427 * <p> 1428 * The overall effect is exactly as if the argument were converted to a string by the method 1429 * valueOf(char) and the characters in that string were then inserted into target at the 1430 * position indicated by offset16. 1431 * </p> 1432 * <p> 1433 * The offset argument must be greater than or equal to 0, and less than or equal to the limit. 
1434 * 1435 * @param target Char array to insert to 1436 * @param limit End index of the char array, limit <= target.length 1437 * @param offset16 Offset which char32 will be inserted in 1438 * @param char32 Codepoint to be inserted 1439 * @return new limit size 1440 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1441 * @stable ICU 2.1 1442 */ insert(char target[], int limit, int offset16, int char32)1443 public static int insert(char target[], int limit, int offset16, int char32) { 1444 String str = valueOf(char32); 1445 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1446 offset16++; 1447 } 1448 int size = str.length(); 1449 if (limit + size > target.length) { 1450 throw new ArrayIndexOutOfBoundsException(offset16 + size); 1451 } 1452 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16); 1453 target[offset16] = str.charAt(0); 1454 if (size == 2) { 1455 target[offset16 + 1] = str.charAt(1); 1456 } 1457 return limit + size; 1458 } 1459 1460 /** 1461 * Removes the codepoint at the specified position in this target (shortening target by 1 1462 * character if the codepoint is a non-supplementary, 2 otherwise). 1463 * 1464 * @param target String buffer to remove codepoint from 1465 * @param offset16 Offset which the codepoint will be removed 1466 * @return a reference to target 1467 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 
1468 * @stable ICU 2.1 1469 */ delete(StringBuffer target, int offset16)1470 public static StringBuffer delete(StringBuffer target, int offset16) { 1471 int count = 1; 1472 switch (bounds(target, offset16)) { 1473 case LEAD_SURROGATE_BOUNDARY: 1474 count++; 1475 break; 1476 case TRAIL_SURROGATE_BOUNDARY: 1477 count++; 1478 offset16--; 1479 break; 1480 } 1481 target.delete(offset16, offset16 + count); 1482 return target; 1483 } 1484 1485 /** 1486 * Removes the codepoint at the specified position in this target (shortening target by 1 1487 * character if the codepoint is a non-supplementary, 2 otherwise). 1488 * 1489 * @param target String buffer to remove codepoint from 1490 * @param limit End index of the char array, limit <= target.length 1491 * @param offset16 Offset which the codepoint will be removed 1492 * @return a new limit size 1493 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1494 * @stable ICU 2.1 1495 */ delete(char target[], int limit, int offset16)1496 public static int delete(char target[], int limit, int offset16) { 1497 int count = 1; 1498 switch (bounds(target, 0, limit, offset16)) { 1499 case LEAD_SURROGATE_BOUNDARY: 1500 count++; 1501 break; 1502 case TRAIL_SURROGATE_BOUNDARY: 1503 count++; 1504 offset16--; 1505 break; 1506 } 1507 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count)); 1508 target[limit - count] = 0; 1509 return limit - count; 1510 } 1511 1512 /** 1513 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1514 * the argument codepoint. I.e., the smallest index <code>i</code> such that 1515 * <code>UTF16.charAt(source, i) == 1516 * char32</code> is true. 1517 * <p> 1518 * If no such character occurs in this string, then -1 is returned. 
1519 * </p> 1520 * <p> 1521 * Examples:<br> 1522 * UTF16.indexOf("abc", 'a') returns 0<br> 1523 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1524 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1525 * </p> 1526 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1527 * characters to its fullest. 1528 * 1529 * @param source UTF16 format Unicode string that will be searched 1530 * @param char32 Codepoint to search for 1531 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1532 * -1 if the codepoint does not occur. 1533 * @stable ICU 2.6 1534 */ indexOf(String source, int char32)1535 public static int indexOf(String source, int char32) { 1536 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1537 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1538 } 1539 // non-surrogate bmp 1540 if (char32 < LEAD_SURROGATE_MIN_VALUE 1541 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1542 return source.indexOf((char) char32); 1543 } 1544 // surrogate 1545 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1546 int result = source.indexOf((char) char32); 1547 if (result >= 0) { 1548 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1549 && isTrailSurrogate(source.charAt(result + 1))) { 1550 return indexOf(source, char32, result + 1); 1551 } 1552 // trail surrogate 1553 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1554 return indexOf(source, char32, result + 1); 1555 } 1556 } 1557 return result; 1558 } 1559 // supplementary 1560 String char32str = toString(char32); 1561 return source.indexOf(char32str); 1562 } 1563 1564 /** 1565 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1566 * the argument string str. 
 * This method is implemented based on codepoints, hence a "lead
 * surrogate character + trail surrogate character" is treated as one entity. Hence if str
 * starts with a trail surrogate character at index 0, an occurrence of str in source that is
 * immediately preceded by a lead surrogate character is not a valid match. Vice versa for a
 * lead surrogate that ends str. See example below.
 * <p>
 * If no such string str occurs in this source, then -1 is returned.
 * </p>
 * <p>
 * Examples:<br>
 * UTF16.indexOf("abc", "ab") returns 0<br>
 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
 * </p>
 * Note this method is provided as support to jdk 1.3, which does not support supplementary
 * characters to its fullest.
 *
 * @param source UTF16 format Unicode string that will be searched
 * @param str UTF16 format Unicode string to search for
 * @return the index of the first occurrence of str in the argument Unicode string, or
 *         -1 if str does not occur.
1587 * @stable ICU 2.6 1588 */ indexOf(String source, String str)1589 public static int indexOf(String source, String str) { 1590 int strLength = str.length(); 1591 // non-surrogate ends 1592 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1593 return source.indexOf(str); 1594 } 1595 1596 int result = source.indexOf(str); 1597 int resultEnd = result + strLength; 1598 if (result >= 0) { 1599 // check last character 1600 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1601 && isTrailSurrogate(source.charAt(resultEnd + 1))) { 1602 return indexOf(source, str, resultEnd + 1); 1603 } 1604 // check first character which is a trail surrogate 1605 if (isTrailSurrogate(str.charAt(0)) && result > 0 1606 && isLeadSurrogate(source.charAt(result - 1))) { 1607 return indexOf(source, str, resultEnd + 1); 1608 } 1609 } 1610 return result; 1611 } 1612 1613 /** 1614 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1615 * the argument codepoint. I.e., the smallest index i such that: <br> 1616 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true. 1617 * <p> 1618 * If no such character occurs in this string, then -1 is returned. 1619 * </p> 1620 * <p> 1621 * Examples:<br> 1622 * UTF16.indexOf("abc", 'a', 1) returns -1<br> 1623 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br> 1624 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br> 1625 * </p> 1626 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1627 * characters to its fullest. 1628 * 1629 * @param source UTF16 format Unicode string that will be searched 1630 * @param char32 Codepoint to search for 1631 * @param fromIndex The index to start the search from. 1632 * @return the index of the first occurrence of the codepoint in the argument Unicode string at 1633 * or after fromIndex, or -1 if the codepoint does not occur. 
1634 * @stable ICU 2.6 1635 */ indexOf(String source, int char32, int fromIndex)1636 public static int indexOf(String source, int char32, int fromIndex) { 1637 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1638 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1639 } 1640 // non-surrogate bmp 1641 if (char32 < LEAD_SURROGATE_MIN_VALUE 1642 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1643 return source.indexOf((char) char32, fromIndex); 1644 } 1645 // surrogate 1646 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1647 int result = source.indexOf((char) char32, fromIndex); 1648 if (result >= 0) { 1649 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1650 && isTrailSurrogate(source.charAt(result + 1))) { 1651 return indexOf(source, char32, result + 1); 1652 } 1653 // trail surrogate 1654 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1655 return indexOf(source, char32, result + 1); 1656 } 1657 } 1658 return result; 1659 } 1660 // supplementary 1661 String char32str = toString(char32); 1662 return source.indexOf(char32str, fromIndex); 1663 } 1664 1665 /** 1666 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1667 * the argument string str. This method is implemented based on codepoints, hence a "lead 1668 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1669 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1670 * character before str found at in source will not have a valid match. Vice versa for lead 1671 * surrogates that ends str. See example below. 1672 * <p> 1673 * If no such string str occurs in this source, then -1 is returned. 
1674 * </p> 1675 * <p> 1676 * Examples:<br> 1677 * UTF16.indexOf("abc", "ab", 0) returns 0<br> 1678 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br> 1679 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br> 1680 * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br> 1681 * </p> 1682 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1683 * characters to its fullest. 1684 * 1685 * @param source UTF16 format Unicode string that will be searched 1686 * @param str UTF16 format Unicode string to search for 1687 * @param fromIndex The index to start the search from. 1688 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1689 * -1 if the codepoint does not occur. 1690 * @stable ICU 2.6 1691 */ indexOf(String source, String str, int fromIndex)1692 public static int indexOf(String source, String str, int fromIndex) { 1693 int strLength = str.length(); 1694 // non-surrogate ends 1695 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1696 return source.indexOf(str, fromIndex); 1697 } 1698 1699 int result = source.indexOf(str, fromIndex); 1700 int resultEnd = result + strLength; 1701 if (result >= 0) { 1702 // check last character 1703 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1704 && isTrailSurrogate(source.charAt(resultEnd))) { 1705 return indexOf(source, str, resultEnd + 1); 1706 } 1707 // check first character which is a trail surrogate 1708 if (isTrailSurrogate(str.charAt(0)) && result > 0 1709 && isLeadSurrogate(source.charAt(result - 1))) { 1710 return indexOf(source, str, resultEnd + 1); 1711 } 1712 } 1713 return result; 1714 } 1715 1716 /** 1717 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1718 * the argument codepoint. 
I.e., the index returned is the largest value i such that: 1719 * UTF16.charAt(source, i) == char32 is true. 1720 * <p> 1721 * Examples:<br> 1722 * UTF16.lastIndexOf("abc", 'a') returns 0<br> 1723 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1724 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1725 * </p> 1726 * <p> 1727 * source is searched backwards starting at the last character. 1728 * </p> 1729 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1730 * characters to its fullest. 1731 * 1732 * @param source UTF16 format Unicode string that will be searched 1733 * @param char32 Codepoint to search for 1734 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1735 * does not occur. 1736 * @stable ICU 2.6 1737 */ lastIndexOf(String source, int char32)1738 public static int lastIndexOf(String source, int char32) { 1739 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1740 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1741 } 1742 // non-surrogate bmp 1743 if (char32 < LEAD_SURROGATE_MIN_VALUE 1744 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1745 return source.lastIndexOf((char) char32); 1746 } 1747 // surrogate 1748 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1749 int result = source.lastIndexOf((char) char32); 1750 if (result >= 0) { 1751 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1752 && isTrailSurrogate(source.charAt(result + 1))) { 1753 return lastIndexOf(source, char32, result - 1); 1754 } 1755 // trail surrogate 1756 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1757 return lastIndexOf(source, char32, result - 1); 1758 } 1759 } 1760 return result; 1761 } 1762 // supplementary 1763 String char32str = toString(char32); 1764 return source.lastIndexOf(char32str); 1765 } 1766 1767 /** 1768 * Returns the index within the argument 
UTF16 format Unicode string of the last occurrence of 1769 * the argument string str. This method is implemented based on codepoints, hence a "lead 1770 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1771 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1772 * character before str found at in source will not have a valid match. Vice versa for lead 1773 * surrogates that ends str. See example below. 1774 * <p> 1775 * Examples:<br> 1776 * UTF16.lastIndexOf("abc", "a") returns 0<br> 1777 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br> 1778 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br> 1779 * </p> 1780 * <p> 1781 * source is searched backwards starting at the last character. 1782 * </p> 1783 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1784 * characters to its fullest. 1785 * 1786 * @param source UTF16 format Unicode string that will be searched 1787 * @param str UTF16 format Unicode string to search for 1788 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1789 * does not occur. 
1790 * @stable ICU 2.6 1791 */ lastIndexOf(String source, String str)1792 public static int lastIndexOf(String source, String str) { 1793 int strLength = str.length(); 1794 // non-surrogate ends 1795 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1796 return source.lastIndexOf(str); 1797 } 1798 1799 int result = source.lastIndexOf(str); 1800 if (result >= 0) { 1801 // check last character 1802 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1803 && isTrailSurrogate(source.charAt(result + strLength + 1))) { 1804 return lastIndexOf(source, str, result - 1); 1805 } 1806 // check first character which is a trail surrogate 1807 if (isTrailSurrogate(str.charAt(0)) && result > 0 1808 && isLeadSurrogate(source.charAt(result - 1))) { 1809 return lastIndexOf(source, str, result - 1); 1810 } 1811 } 1812 return result; 1813 } 1814 1815 /** 1816 * <p> 1817 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1818 * the argument codepoint, where the result is less than or equals to fromIndex. 1819 * </p> 1820 * <p> 1821 * This method is implemented based on codepoints, hence a single surrogate character will not 1822 * match a supplementary character. 1823 * </p> 1824 * <p> 1825 * source is searched backwards starting at the last character starting at the specified index. 1826 * </p> 1827 * <p> 1828 * Examples:<br> 1829 * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br> 1830 * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br> 1831 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br> 1832 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br> 1833 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1834 * </p> 1835 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1836 * characters to its fullest. 
1837 * 1838 * @param source UTF16 format Unicode string that will be searched 1839 * @param char32 Codepoint to search for 1840 * @param fromIndex the index to start the search from. There is no restriction on the value of 1841 * fromIndex. If it is greater than or equal to the length of this string, it has the 1842 * same effect as if it were equal to one less than the length of this string: this 1843 * entire string may be searched. If it is negative, it has the same effect as if it 1844 * were -1: -1 is returned. 1845 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1846 * does not occur. 1847 * @stable ICU 2.6 1848 */ lastIndexOf(String source, int char32, int fromIndex)1849 public static int lastIndexOf(String source, int char32, int fromIndex) { 1850 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1851 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1852 } 1853 // non-surrogate bmp 1854 if (char32 < LEAD_SURROGATE_MIN_VALUE 1855 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1856 return source.lastIndexOf((char) char32, fromIndex); 1857 } 1858 // surrogate 1859 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1860 int result = source.lastIndexOf((char) char32, fromIndex); 1861 if (result >= 0) { 1862 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1863 && isTrailSurrogate(source.charAt(result + 1))) { 1864 return lastIndexOf(source, char32, result - 1); 1865 } 1866 // trail surrogate 1867 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1868 return lastIndexOf(source, char32, result - 1); 1869 } 1870 } 1871 return result; 1872 } 1873 // supplementary 1874 String char32str = toString(char32); 1875 return source.lastIndexOf(char32str, fromIndex); 1876 } 1877 1878 /** 1879 * <p> 1880 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1881 * the argument string str, 
where the result is less than or equals to fromIndex. 1882 * </p> 1883 * <p> 1884 * This method is implemented based on codepoints, hence a "lead surrogate character + trail 1885 * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate 1886 * character at index 0, a source with a leading a surrogate character before str found at in 1887 * source will not have a valid match. Vice versa for lead surrogates that ends str. 1888 * </p> 1889 * See example below. 1890 * <p> 1891 * Examples:<br> 1892 * UTF16.lastIndexOf("abc", "c", 2) returns 2<br> 1893 * UTF16.lastIndexOf("abc", "c", 1) returns -1<br> 1894 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br> 1895 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br> 1896 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br> 1897 * </p> 1898 * <p> 1899 * source is searched backwards starting at the last character. 1900 * </p> 1901 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1902 * characters to its fullest. 1903 * 1904 * @param source UTF16 format Unicode string that will be searched 1905 * @param str UTF16 format Unicode string to search for 1906 * @param fromIndex the index to start the search from. There is no restriction on the value of 1907 * fromIndex. If it is greater than or equal to the length of this string, it has the 1908 * same effect as if it were equal to one less than the length of this string: this 1909 * entire string may be searched. If it is negative, it has the same effect as if it 1910 * were -1: -1 is returned. 1911 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1912 * does not occur. 
1913 * @stable ICU 2.6 1914 */ lastIndexOf(String source, String str, int fromIndex)1915 public static int lastIndexOf(String source, String str, int fromIndex) { 1916 int strLength = str.length(); 1917 // non-surrogate ends 1918 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1919 return source.lastIndexOf(str, fromIndex); 1920 } 1921 1922 int result = source.lastIndexOf(str, fromIndex); 1923 if (result >= 0) { 1924 // check last character 1925 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1926 && isTrailSurrogate(source.charAt(result + strLength))) { 1927 return lastIndexOf(source, str, result - 1); 1928 } 1929 // check first character which is a trail surrogate 1930 if (isTrailSurrogate(str.charAt(0)) && result > 0 1931 && isLeadSurrogate(source.charAt(result - 1))) { 1932 return lastIndexOf(source, str, result - 1); 1933 } 1934 } 1935 return result; 1936 } 1937 1938 /** 1939 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of 1940 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16 1941 * format Unicode string source, then source will be returned. Otherwise, a new String object is 1942 * created that represents a codepoint sequence identical to the codepoint sequence represented 1943 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of 1944 * newChar32. 
1945 * <p> 1946 * Examples: <br> 1947 * UTF16.replace("mesquite in your cellar", 'e', 'o');<br> 1948 * returns "mosquito in your collar"<br> 1949 * UTF16.replace("JonL", 'q', 'x');<br> 1950 * returns "JonL" (no change)<br> 1951 * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br> 1952 * returns "Supplementary character !"<br> 1953 * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br> 1954 * returns "Supplementary character \ud800\udc00"<br> 1955 * </p> 1956 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1957 * characters to its fullest. 1958 * 1959 * @param source UTF16 format Unicode string which the codepoint replacements will be based on. 1960 * @param oldChar32 Non-zero old codepoint to be replaced. 1961 * @param newChar32 The new codepoint to replace oldChar32 1962 * @return new String derived from source by replacing every occurrence of oldChar32 with 1963 * newChar32, unless when no oldChar32 is found in source then source will be returned. 
1964 * @stable ICU 2.6 1965 */ replace(String source, int oldChar32, int newChar32)1966 public static String replace(String source, int oldChar32, int newChar32) { 1967 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) { 1968 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint"); 1969 } 1970 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) { 1971 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint"); 1972 } 1973 1974 int index = indexOf(source, oldChar32); 1975 if (index == -1) { 1976 return source; 1977 } 1978 String newChar32Str = toString(newChar32); 1979 int oldChar32Size = 1; 1980 int newChar32Size = newChar32Str.length(); 1981 StringBuffer result = new StringBuffer(source); 1982 int resultIndex = index; 1983 1984 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) { 1985 oldChar32Size = 2; 1986 } 1987 1988 while (index != -1) { 1989 int endResultIndex = resultIndex + oldChar32Size; 1990 result.replace(resultIndex, endResultIndex, newChar32Str); 1991 int lastEndIndex = index + oldChar32Size; 1992 index = indexOf(source, oldChar32, lastEndIndex); 1993 resultIndex += newChar32Size + index - lastEndIndex; 1994 } 1995 return result.toString(); 1996 } 1997 1998 /** 1999 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr 2000 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string 2001 * source, then source will be returned. Otherwise, a new String object is created that 2002 * represents a codepoint sequence identical to the codepoint sequence represented by source, 2003 * except that every occurrence of oldStr is replaced by an occurrence of newStr. 
2004 * <p> 2005 * Examples: <br> 2006 * UTF16.replace("mesquite in your cellar", "e", "o");<br> 2007 * returns "mosquito in your collar"<br> 2008 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br> 2009 * returns "cat in your cellar"<br> 2010 * UTF16.replace("JonL", "q", "x");<br> 2011 * returns "JonL" (no change)<br> 2012 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br> 2013 * returns "Supplementary character !"<br> 2014 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br> 2015 * returns "Supplementary character \ud800\udc00"<br> 2016 * </p> 2017 * Note this method is provided as support to jdk 1.3, which does not support supplementary 2018 * characters to its fullest. 2019 * 2020 * @param source UTF16 format Unicode string which the replacements will be based on. 2021 * @param oldStr Non-zero-length string to be replaced. 2022 * @param newStr The new string to replace oldStr 2023 * @return new String derived from source by replacing every occurrence of oldStr with newStr. 2024 * When no oldStr is found in source, then source will be returned. 2025 * @stable ICU 2.6 2026 */ replace(String source, String oldStr, String newStr)2027 public static String replace(String source, String oldStr, String newStr) { 2028 int index = indexOf(source, oldStr); 2029 if (index == -1) { 2030 return source; 2031 } 2032 int oldStrSize = oldStr.length(); 2033 int newStrSize = newStr.length(); 2034 StringBuffer result = new StringBuffer(source); 2035 int resultIndex = index; 2036 2037 while (index != -1) { 2038 int endResultIndex = resultIndex + oldStrSize; 2039 result.replace(resultIndex, endResultIndex, newStr); 2040 int lastEndIndex = index + oldStrSize; 2041 index = indexOf(source, oldStr, lastEndIndex); 2042 resultIndex += newStrSize + index - lastEndIndex; 2043 } 2044 return result.toString(); 2045 } 2046 2047 /** 2048 * Reverses a UTF16 format Unicode string and replaces source's content with it. 
This method 2049 * will reverse surrogate characters correctly, instead of blindly reversing every character. 2050 * <p> 2051 * Examples:<br> 2052 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br> 2053 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS". 2054 * 2055 * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed 2056 * @return a modified source with reversed UTF16 format Unicode string. 2057 * @stable ICU 2.6 2058 */ reverse(StringBuffer source)2059 public static StringBuffer reverse(StringBuffer source) { 2060 int length = source.length(); 2061 StringBuffer result = new StringBuffer(length); 2062 for (int i = length; i-- > 0;) { 2063 char ch = source.charAt(i); 2064 if (isTrailSurrogate(ch) && i > 0) { 2065 char ch2 = source.charAt(i - 1); 2066 if (isLeadSurrogate(ch2)) { 2067 result.append(ch2); 2068 result.append(ch); 2069 --i; 2070 continue; 2071 } 2072 } 2073 result.append(ch); 2074 } 2075 return result; 2076 } 2077 2078 /** 2079 * Check if the string contains more Unicode code points than a certain number. This is more 2080 * efficient than counting all code points in the entire string and comparing that number with a 2081 * threshold. This function may not need to scan the string at all if the length is within a 2082 * certain range, and never needs to count more than 'number + 1' code points. Logically 2083 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two 2084 * code units. 2085 * 2086 * @param source The input string. 2087 * @param number The number of code points in the string is compared against the 'number' 2088 * parameter. 2089 * @return boolean value for whether the string contains more Unicode code points than 'number'. 
2090 * @stable ICU 2.4 2091 */ hasMoreCodePointsThan(String source, int number)2092 public static boolean hasMoreCodePointsThan(String source, int number) { 2093 if (number < 0) { 2094 return true; 2095 } 2096 if (source == null) { 2097 return false; 2098 } 2099 int length = source.length(); 2100 2101 // length >= 0 known 2102 // source contains at least (length + 1) / 2 code points: <= 2 2103 // chars per cp 2104 if (((length + 1) >> 1) > number) { 2105 return true; 2106 } 2107 2108 // check if source does not even contain enough chars 2109 int maxsupplementary = length - number; 2110 if (maxsupplementary <= 0) { 2111 return false; 2112 } 2113 2114 // there are maxsupplementary = length - number more chars than 2115 // asked-for code points 2116 2117 // count code points until they exceed and also check that there are 2118 // no more than maxsupplementary supplementary code points (char pairs) 2119 int start = 0; 2120 while (true) { 2121 if (length == 0) { 2122 return false; 2123 } 2124 if (number == 0) { 2125 return true; 2126 } 2127 if (isLeadSurrogate(source.charAt(start++)) && start != length 2128 && isTrailSurrogate(source.charAt(start))) { 2129 start++; 2130 if (--maxsupplementary <= 0) { 2131 // too many pairs - too few code points 2132 return false; 2133 } 2134 } 2135 --number; 2136 } 2137 } 2138 2139 /** 2140 * Check if the sub-range of char array, from argument start to limit, contains more Unicode 2141 * code points than a certain number. This is more efficient than counting all code points in 2142 * the entire char array range and comparing that number with a threshold. This function may not 2143 * need to scan the char array at all if start and limit is within a certain range, and never 2144 * needs to count more than 'number + 1' code points. Logically equivalent to 2145 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one 2146 * or two code units. 
2147 * 2148 * @param source Array of UTF-16 chars 2149 * @param start Offset to substring in the source array for analyzing 2150 * @param limit Offset to substring in the source array for analyzing 2151 * @param number The number of code points in the string is compared against the 'number' 2152 * parameter. 2153 * @return boolean value for whether the string contains more Unicode code points than 'number'. 2154 * @exception IndexOutOfBoundsException Thrown when limit < start 2155 * @stable ICU 2.4 2156 */ hasMoreCodePointsThan(char source[], int start, int limit, int number)2157 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) { 2158 int length = limit - start; 2159 if (length < 0 || start < 0 || limit < 0) { 2160 throw new IndexOutOfBoundsException( 2161 "Start and limit indexes should be non-negative and start <= limit"); 2162 } 2163 if (number < 0) { 2164 return true; 2165 } 2166 if (source == null) { 2167 return false; 2168 } 2169 2170 // length >= 0 known 2171 // source contains at least (length + 1) / 2 code points: <= 2 2172 // chars per cp 2173 if (((length + 1) >> 1) > number) { 2174 return true; 2175 } 2176 2177 // check if source does not even contain enough chars 2178 int maxsupplementary = length - number; 2179 if (maxsupplementary <= 0) { 2180 return false; 2181 } 2182 2183 // there are maxsupplementary = length - number more chars than 2184 // asked-for code points 2185 2186 // count code points until they exceed and also check that there are 2187 // no more than maxsupplementary supplementary code points (char pairs) 2188 while (true) { 2189 if (length == 0) { 2190 return false; 2191 } 2192 if (number == 0) { 2193 return true; 2194 } 2195 if (isLeadSurrogate(source[start++]) && start != limit 2196 && isTrailSurrogate(source[start])) { 2197 start++; 2198 if (--maxsupplementary <= 0) { 2199 // too many pairs - too few code points 2200 return false; 2201 } 2202 } 2203 --number; 2204 } 2205 } 2206 2207 /** 
2208 * Check if the string buffer contains more Unicode code points than a certain number. This is 2209 * more efficient than counting all code points in the entire string buffer and comparing that 2210 * number with a threshold. This function may not need to scan the string buffer at all if the 2211 * length is within a certain range, and never needs to count more than 'number + 1' code 2212 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may 2213 * occupy either one or two code units. 2214 * 2215 * @param source The input string buffer. 2216 * @param number The number of code points in the string buffer is compared against the 'number' 2217 * parameter. 2218 * @return boolean value for whether the string buffer contains more Unicode code points than 2219 * 'number'. 2220 * @stable ICU 2.4 2221 */ hasMoreCodePointsThan(StringBuffer source, int number)2222 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) { 2223 if (number < 0) { 2224 return true; 2225 } 2226 if (source == null) { 2227 return false; 2228 } 2229 int length = source.length(); 2230 2231 // length >= 0 known 2232 // source contains at least (length + 1) / 2 code points: <= 2 2233 // chars per cp 2234 if (((length + 1) >> 1) > number) { 2235 return true; 2236 } 2237 2238 // check if source does not even contain enough chars 2239 int maxsupplementary = length - number; 2240 if (maxsupplementary <= 0) { 2241 return false; 2242 } 2243 2244 // there are maxsupplementary = length - number more chars than 2245 // asked-for code points 2246 2247 // count code points until they exceed and also check that there are 2248 // no more than maxsupplementary supplementary code points (char pairs) 2249 int start = 0; 2250 while (true) { 2251 if (length == 0) { 2252 return false; 2253 } 2254 if (number == 0) { 2255 return true; 2256 } 2257 if (isLeadSurrogate(source.charAt(start++)) && start != length 2258 && isTrailSurrogate(source.charAt(start))) { 2259 
start++; 2260 if (--maxsupplementary <= 0) { 2261 // too many pairs - too few code points 2262 return false; 2263 } 2264 } 2265 --number; 2266 } 2267 } 2268 2269 /** 2270 * Cover JDK 1.5 API. Create a String from an array of codePoints. 2271 * 2272 * @param codePoints The code array 2273 * @param offset The start of the text in the code point array 2274 * @param count The number of code points 2275 * @return a String representing the code points between offset and count 2276 * @throws IllegalArgumentException If an invalid code point is encountered 2277 * @throws IndexOutOfBoundsException If the offset or count are out of bounds. 2278 * @stable ICU 3.0 2279 */ newString(int[] codePoints, int offset, int count)2280 public static String newString(int[] codePoints, int offset, int count) { 2281 if (count < 0) { 2282 throw new IllegalArgumentException(); 2283 } 2284 char[] chars = new char[count]; 2285 int w = 0; 2286 for (int r = offset, e = offset + count; r < e; ++r) { 2287 int cp = codePoints[r]; 2288 if (cp < 0 || cp > 0x10ffff) { 2289 throw new IllegalArgumentException(); 2290 } 2291 while (true) { 2292 try { 2293 if (cp < 0x010000) { 2294 chars[w] = (char) cp; 2295 w++; 2296 } else { 2297 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); 2298 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); 2299 w += 2; 2300 } 2301 break; 2302 } catch (IndexOutOfBoundsException ex) { 2303 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2) 2304 / (r - offset + 1))); 2305 char[] temp = new char[newlen]; 2306 System.arraycopy(chars, 0, temp, 0, w); 2307 chars = temp; 2308 } 2309 } 2310 } 2311 return new String(chars, 0, w); 2312 } 2313 2314 /** 2315 * <p> 2316 * UTF16 string comparator class. 
Allows UTF16 string comparison to be done with the various 2317 * modes 2318 * </p> 2319 * <ul> 2320 * <li> Code point comparison or code unit comparison 2321 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison 2322 * with special handling for character 'i'. 2323 * </ul> 2324 * <p> 2325 * The code unit or code point comparison differ only when comparing supplementary code points 2326 * (\u10000..\u10ffff) to BMP code points near the end of the BMP (i.e., 2327 * \ue000..\uffff). In code unit comparison, high BMP code points sort after 2328 * supplementary code points because they are stored as pairs of surrogates which are at 2329 * \ud800..\udfff. 2330 * </p> 2331 * 2332 * @see #FOLD_CASE_DEFAULT 2333 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2334 * @stable ICU 2.1 2335 */ 2336 public static final class StringComparator implements java.util.Comparator<String> { 2337 // public constructor ------------------------------------------------ 2338 2339 /** 2340 * Default constructor that does code unit comparison and case sensitive comparison. 2341 * 2342 * @stable ICU 2.1 2343 */ StringComparator()2344 public StringComparator() { 2345 this(false, false, FOLD_CASE_DEFAULT); 2346 } 2347 2348 /** 2349 * Constructor that does comparison based on the argument options. 2350 * 2351 * @param codepointcompare Flag to indicate true for code point comparison or false for code unit 2352 * comparison. 2353 * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison 2354 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2355 * when ignorecase is set to true. If ignorecase is false, this option is 2356 * ignored. 
2357 * @see #FOLD_CASE_DEFAULT 2358 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2359 * @throws IllegalArgumentException If foldcaseoption is out of range 2360 * @stable ICU 2.4 2361 */ StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption)2362 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) { 2363 setCodePointCompare(codepointcompare); 2364 m_ignoreCase_ = ignorecase; 2365 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2366 throw new IllegalArgumentException("Invalid fold case option"); 2367 } 2368 m_foldCase_ = foldcaseoption; 2369 } 2370 2371 // public data member ------------------------------------------------ 2372 2373 /** 2374 * Option value for case folding comparison: 2375 * 2376 * <p>Comparison is case insensitive, strings are folded using default mappings defined in 2377 * Unicode data file CaseFolding.txt, before comparison. 2378 * 2379 * @stable ICU 2.4 2380 */ 2381 public static final int FOLD_CASE_DEFAULT = 0; 2382 2383 /** 2384 * Option value for case folding: 2385 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I 2386 * and dotless i appropriately for Turkic languages (tr, az). 2387 * 2388 * <p>Comparison is case insensitive, strings are folded using modified mappings defined in 2389 * Unicode data file CaseFolding.txt, before comparison. 2390 * 2391 * @stable ICU 2.4 2392 * @see com.ibm.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I 2393 */ 2394 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1; 2395 2396 // public methods ---------------------------------------------------- 2397 2398 // public setters ---------------------------------------------------- 2399 2400 /** 2401 * Sets the comparison mode to code point compare if flag is true. 
Otherwise comparison mode 2402 * is set to code unit compare 2403 * 2404 * @param flag True for code point compare, false for code unit compare 2405 * @stable ICU 2.4 2406 */ setCodePointCompare(boolean flag)2407 public void setCodePointCompare(boolean flag) { 2408 if (flag) { 2409 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER; 2410 } else { 2411 m_codePointCompare_ = 0; 2412 } 2413 } 2414 2415 /** 2416 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise 2417 * case sensitive comparison mode if set to false. 2418 * 2419 * @param ignorecase True for case-insitive comparison, false for case sensitive comparison 2420 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2421 * when ignorecase is set to true. If ignorecase is false, this option is 2422 * ignored. 2423 * @see #FOLD_CASE_DEFAULT 2424 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2425 * @stable ICU 2.4 2426 */ setIgnoreCase(boolean ignorecase, int foldcaseoption)2427 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) { 2428 m_ignoreCase_ = ignorecase; 2429 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2430 throw new IllegalArgumentException("Invalid fold case option"); 2431 } 2432 m_foldCase_ = foldcaseoption; 2433 } 2434 2435 // public getters ---------------------------------------------------- 2436 2437 /** 2438 * Checks if the comparison mode is code point compare. 2439 * 2440 * @return true for code point compare, false for code unit compare 2441 * @stable ICU 2.4 2442 */ getCodePointCompare()2443 public boolean getCodePointCompare() { 2444 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2445 } 2446 2447 /** 2448 * Checks if Comparator is in the case insensitive mode. 
2449 * 2450 * @return true if Comparator performs case insensitive comparison, false otherwise 2451 * @stable ICU 2.4 2452 */ getIgnoreCase()2453 public boolean getIgnoreCase() { 2454 return m_ignoreCase_; 2455 } 2456 2457 /** 2458 * Gets the fold case options set in Comparator to be used with case insensitive comparison. 2459 * 2460 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I 2461 * @see #FOLD_CASE_DEFAULT 2462 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2463 * @stable ICU 2.4 2464 */ getIgnoreCaseOption()2465 public int getIgnoreCaseOption() { 2466 return m_foldCase_; 2467 } 2468 2469 // public other methods ---------------------------------------------- 2470 2471 /** 2472 * Compare two strings depending on the options selected during construction. 2473 * 2474 * @param a first source string. 2475 * @param b second source string. 2476 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b, 2477 * a positive value is returned. 2478 * @exception ClassCastException thrown when either a or b is not a String object 2479 * @stable ICU 4.4 2480 */ 2481 @Override compare(String a, String b)2482 public int compare(String a, String b) { 2483 if (Utility.sameObjects(a, b)) { 2484 return 0; 2485 } 2486 if (a == null) { 2487 return -1; 2488 } 2489 if (b == null) { 2490 return 1; 2491 } 2492 2493 if (m_ignoreCase_) { 2494 return compareCaseInsensitive(a, b); 2495 } 2496 return compareCaseSensitive(a, b); 2497 } 2498 2499 // private data member ---------------------------------------------- 2500 2501 /** 2502 * Code unit comparison flag. True if code unit comparison is required. False if code point 2503 * comparison is required. 2504 */ 2505 private int m_codePointCompare_; 2506 2507 /** 2508 * Fold case comparison option. 
2509 */ 2510 private int m_foldCase_; 2511 2512 /** 2513 * Flag indicator if ignore case is to be used during comparison 2514 */ 2515 private boolean m_ignoreCase_; 2516 2517 /** 2518 * Code point order offset for surrogate characters 2519 */ 2520 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800; 2521 2522 // private method --------------------------------------------------- 2523 2524 /** 2525 * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life 2526 * easier. 2527 * 2528 * @param s1 2529 * first string to compare 2530 * @param s2 2531 * second string to compare 2532 * @return -1 is s1 < s2, 0 if equals, 2533 */ compareCaseInsensitive(String s1, String s2)2534 private int compareCaseInsensitive(String s1, String s2) { 2535 return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_ 2536 | Normalizer.COMPARE_IGNORE_CASE); 2537 } 2538 2539 /** 2540 * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life 2541 * easier. 
2542 * 2543 * @param s1 2544 * first string to compare 2545 * @param s2 2546 * second string to compare 2547 * @return -1 is s1 < s2, 0 if equals, 2548 */ compareCaseSensitive(String s1, String s2)2549 private int compareCaseSensitive(String s1, String s2) { 2550 // compare identical prefixes - they do not need to be fixed up 2551 // limit1 = start1 + min(lenght1, length2) 2552 int length1 = s1.length(); 2553 int length2 = s2.length(); 2554 int minlength = length1; 2555 int result = 0; 2556 if (length1 < length2) { 2557 result = -1; 2558 } else if (length1 > length2) { 2559 result = 1; 2560 minlength = length2; 2561 } 2562 2563 char c1 = 0; 2564 char c2 = 0; 2565 int index = 0; 2566 for (; index < minlength; index++) { 2567 c1 = s1.charAt(index); 2568 c2 = s2.charAt(index); 2569 // check pseudo-limit 2570 if (c1 != c2) { 2571 break; 2572 } 2573 } 2574 2575 if (index == minlength) { 2576 return result; 2577 } 2578 2579 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2580 // if both values are in or above the surrogate range, fix them up 2581 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE 2582 && codepointcompare) { 2583 // subtract 0x2800 from BMP code points to make them smaller 2584 // than supplementary ones 2585 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1))) 2586 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) { 2587 // part of a surrogate pair, leave >=d800 2588 } else { 2589 // BMP code point - may be surrogate code point - make 2590 // < d800 2591 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2592 } 2593 2594 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1))) 2595 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) { 2596 // part of a surrogate pair, leave >=d800 2597 } else { 2598 // BMP code point - may be surrogate code point 
- make <d800 2599 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2600 } 2601 } 2602 2603 // now c1 and c2 are in UTF-32-compatible order 2604 return c1 - c2; 2605 } 2606 } 2607 2608 /** 2609 * Utility for getting a code point from a CharSequence that contains exactly one code point. 2610 * @return the code point IF the string is non-null and consists of a single code point. 2611 * otherwise returns -1. 2612 * @param s to test 2613 * @stable ICU 54 2614 */ getSingleCodePoint(CharSequence s)2615 public static int getSingleCodePoint(CharSequence s) { 2616 if (s == null || s.length() == 0) { 2617 return -1; 2618 } else if (s.length() == 1) { 2619 return s.charAt(0); 2620 } else if (s.length() > 2) { 2621 return -1; 2622 } 2623 2624 // at this point, len = 2 2625 int cp = Character.codePointAt(s, 0); 2626 if (cp > 0xFFFF) { // is surrogate pair 2627 return cp; 2628 } 2629 return -1; 2630 } 2631 2632 /** 2633 * Utility for comparing a code point to a string without having to create a new string. Returns the same results 2634 * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if 2635 * <pre> 2636 * sc = new StringComparator(true,false,0); 2637 * fast = UTF16.compareCodePoint(codePoint, charSequence) 2638 * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString()) 2639 * </pre> 2640 * then 2641 * <pre> 2642 * Integer.signum(fast) == Integer.signum(slower) 2643 * </pre> 2644 * @param codePoint to test 2645 * @param s to test 2646 * @return equivalent of code point comparator comparing two strings. 
2647 * @stable ICU 54 2648 */ compareCodePoint(int codePoint, CharSequence s)2649 public static int compareCodePoint(int codePoint, CharSequence s) { 2650 if (s == null) { 2651 return 1; 2652 } 2653 final int strLen = s.length(); 2654 if (strLen == 0) { 2655 return 1; 2656 } 2657 int second = Character.codePointAt(s, 0); 2658 int diff = codePoint - second; 2659 if (diff != 0) { 2660 return diff; 2661 } 2662 return strLen == Character.charCount(codePoint) ? 0 : -1; 2663 } 2664 2665 // private data members ------------------------------------------------- 2666 2667 /** 2668 * Shift value for lead surrogate to form a supplementary character. 2669 */ 2670 private static final int LEAD_SURROGATE_SHIFT_ = 10; 2671 2672 /** 2673 * Mask to retrieve the significant value from a trail surrogate. 2674 */ 2675 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 2676 2677 /** 2678 * Value that all lead surrogate starts with 2679 */ 2680 private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE 2681 - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); 2682 2683 // private methods ------------------------------------------------------ 2684 2685 /** 2686 * <p> 2687 * Converts argument code point and returns a String object representing the code point's value 2688 * in UTF16 format. 2689 * </p> 2690 * <p> 2691 * This method does not check for the validity of the codepoint, the results are not guaranteed 2692 * if a invalid codepoint is passed as argument. 2693 * </p> 2694 * <p> 2695 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise. 
2696 * </p> 2697 * 2698 * @param ch 2699 * code point 2700 * @return string representation of the code point 2701 */ toString(int ch)2702 private static String toString(int ch) { 2703 if (ch < SUPPLEMENTARY_MIN_VALUE) { 2704 return String.valueOf((char) ch); 2705 } 2706 2707 StringBuilder result = new StringBuilder(); 2708 result.append(getLeadSurrogate(ch)); 2709 result.append(getTrailSurrogate(ch)); 2710 return result.toString(); 2711 } 2712 } 2713 // eof 2714