/**
 *******************************************************************************
 * Copyright (C) 1996-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.text;

/**
 * <p>
 * Standalone utility class providing UTF16 character conversions and indexing conversions.
 * </p>
 * <p>
 * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
 * so searching for strings is a safe operation. Similarly, concatenation is always safe.
 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
 * values for start and end are on those boundaries, since they arose from operations like
 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
 * </p>
 * <strong>Examples:</strong>
 * <p>
 * The following examples illustrate use of some of these methods.
 *
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i &lt; s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i &gt;= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * // (the condition must be i &gt;= 0, otherwise the code point at index 0 is never visited)
 * int ch;
 * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 *
 * <strong>Notes:</strong>
 * <ul>
 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
 * </li>
 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
 * check for validity if desired. </li>
 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
 * 5.5). </li>
 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
 * percentage of all the text in the world, the singleton case should always be optimized for. </li>
 * </ul>
 *
 * @author Mark Davis, with help from Markus Scherer
 * @stable ICU 2.1
 */

public final class UTF16 {
    // public variables ---------------------------------------------------

    /**
     * Values returned by {@link #bounds(String, int) bounds()}.
     * They are chosen so that each value encodes the extent of the code point around the
     * queried offset: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
     * That gives [offset16, offset16 + 1] for 1, [offset16, offset16 + 2] for 2 and
     * [offset16 - 1, offset16 + 1] for 5.
     *
     * @stable ICU 2.1
     */
    public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
            TRAIL_SURROGATE_BOUNDARY = 5;

    /**
     * The lowest Unicode code point value.
     *
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MIN_VALUE = 0;

    /**
     * The highest Unicode code point value (scalar value) according to the Unicode Standard.
     *
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;

    /**
     * The minimum value for supplementary code points.
     *
     * @stable ICU 2.1
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate minimum value.
     *
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Trail surrogate minimum value.
     *
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Lead surrogate maximum value.
     *
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate maximum value.
     *
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Surrogate minimum value (same as the lead surrogate minimum).
     *
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    /**
     * Maximum surrogate value (same as the trail surrogate maximum).
     *
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;

    /**
     * Lead surrogate bitmask: a code unit is a lead surrogate when masking it with this value
     * yields {@link #LEAD_SURROGATE_BITS}.
     */
    private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;

    /**
     * Trail surrogate bitmask: a code unit is a trail surrogate when masking it with this value
     * yields {@link #TRAIL_SURROGATE_BITS}.
     */
    private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;

    /**
     * Surrogate bitmask: a code unit is a surrogate (lead or trail) when masking it with this
     * value yields {@link #SURROGATE_BITS}.
     */
    private static final int SURROGATE_BITMASK = 0xFFFFF800;

    /**
     * Lead surrogate bits: the masked value shared by all lead surrogates.
     */
    private static final int LEAD_SURROGATE_BITS = 0xD800;

    /**
     * Trail surrogate bits: the masked value shared by all trail surrogates.
     */
    private static final int TRAIL_SURROGATE_BITS = 0xDC00;

    /**
     * Surrogate bits: the masked value shared by all surrogates.
     */
    private static final int SURROGATE_BITS = 0xD800;

    // constructor --------------------------------------------------------

    // /CLOVER:OFF
192 */ UTF16()193 private UTF16() { 194 } 195 196 // /CLOVER:ON 197 // public method ------------------------------------------------------ 198 199 /** 200 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 201 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 202 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 203 * UCharacter.isLegal()</a></code> 204 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 205 * character will be returned. If a complete supplementary character is not found the incomplete 206 * character will be returned 207 * 208 * @param source Array of UTF-16 chars 209 * @param offset16 UTF-16 offset to the start of the character. 210 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 211 * of that codepoint are the same as in <code>bounds32()</code>. 212 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 213 * @stable ICU 2.1 214 */ charAt(String source, int offset16)215 public static int charAt(String source, int offset16) { 216 char single = source.charAt(offset16); 217 if (single < LEAD_SURROGATE_MIN_VALUE) { 218 return single; 219 } 220 return _charAt(source, offset16, single); 221 } 222 _charAt(String source, int offset16, char single)223 private static int _charAt(String source, int offset16, char single) { 224 if (single > TRAIL_SURROGATE_MAX_VALUE) { 225 return single; 226 } 227 228 // Convert the UTF-16 surrogate pair if necessary. 229 // For simplicity in usage, and because the frequency of pairs is 230 // low, look both directions. 
231 232 if (single <= LEAD_SURROGATE_MAX_VALUE) { 233 ++offset16; 234 if (source.length() != offset16) { 235 char trail = source.charAt(offset16); 236 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { 237 return Character.toCodePoint(single, trail); 238 } 239 } 240 } else { 241 --offset16; 242 if (offset16 >= 0) { 243 // single is a trail surrogate so 244 char lead = source.charAt(offset16); 245 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { 246 return Character.toCodePoint(lead, single); 247 } 248 } 249 } 250 return single; // return unmatched surrogate 251 } 252 253 /** 254 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 255 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 256 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 257 * UCharacter.isLegal()</a></code> 258 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 259 * character will be returned. If a complete supplementary character is not found the incomplete 260 * character will be returned 261 * 262 * @param source Array of UTF-16 chars 263 * @param offset16 UTF-16 offset to the start of the character. 264 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 265 * of that codepoint are the same as in <code>bounds32()</code>. 266 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 
267 * @stable ICU 2.1 268 */ charAt(CharSequence source, int offset16)269 public static int charAt(CharSequence source, int offset16) { 270 char single = source.charAt(offset16); 271 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { 272 return single; 273 } 274 return _charAt(source, offset16, single); 275 } 276 _charAt(CharSequence source, int offset16, char single)277 private static int _charAt(CharSequence source, int offset16, char single) { 278 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { 279 return single; 280 } 281 282 // Convert the UTF-16 surrogate pair if necessary. 283 // For simplicity in usage, and because the frequency of pairs is 284 // low, look both directions. 285 286 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 287 ++offset16; 288 if (source.length() != offset16) { 289 char trail = source.charAt(offset16); 290 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE 291 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { 292 return Character.toCodePoint(single, trail); 293 } 294 } 295 } else { 296 --offset16; 297 if (offset16 >= 0) { 298 // single is a trail surrogate so 299 char lead = source.charAt(offset16); 300 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE 301 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 302 return Character.toCodePoint(lead, single); 303 } 304 } 305 } 306 return single; // return unmatched surrogate 307 } 308 309 /** 310 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 311 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 312 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 313 * </a></code> 314 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 315 * character will be returned. 
If a complete supplementary character is not found the incomplete 316 * character will be returned 317 * 318 * @param source UTF-16 chars string buffer 319 * @param offset16 UTF-16 offset to the start of the character. 320 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 321 * of that codepoint are the same as in <code>bounds32()</code>. 322 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 323 * @stable ICU 2.1 324 */ charAt(StringBuffer source, int offset16)325 public static int charAt(StringBuffer source, int offset16) { 326 if (offset16 < 0 || offset16 >= source.length()) { 327 throw new StringIndexOutOfBoundsException(offset16); 328 } 329 330 char single = source.charAt(offset16); 331 if (!isSurrogate(single)) { 332 return single; 333 } 334 335 // Convert the UTF-16 surrogate pair if necessary. 336 // For simplicity in usage, and because the frequency of pairs is 337 // low, look both directions. 338 339 if (single <= LEAD_SURROGATE_MAX_VALUE) { 340 ++offset16; 341 if (source.length() != offset16) { 342 char trail = source.charAt(offset16); 343 if (isTrailSurrogate(trail)) 344 return Character.toCodePoint(single, trail); 345 } 346 } else { 347 --offset16; 348 if (offset16 >= 0) { 349 // single is a trail surrogate so 350 char lead = source.charAt(offset16); 351 if (isLeadSurrogate(lead)) { 352 return Character.toCodePoint(lead, single); 353 } 354 } 355 } 356 return single; // return unmatched surrogate 357 } 358 359 /** 360 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards 361 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 362 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 363 * </a></code> 364 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 365 * character will be returned. 
If a complete supplementary character is not found the incomplete 366 * character will be returned 367 * 368 * @param source Array of UTF-16 chars 369 * @param start Offset to substring in the source array for analyzing 370 * @param limit Offset to substring in the source array for analyzing 371 * @param offset16 UTF-16 offset relative to start 372 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 373 * of that codepoint are the same as in <code>bounds32()</code>. 374 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. 375 * @stable ICU 2.1 376 */ charAt(char source[], int start, int limit, int offset16)377 public static int charAt(char source[], int start, int limit, int offset16) { 378 offset16 += start; 379 if (offset16 < start || offset16 >= limit) { 380 throw new ArrayIndexOutOfBoundsException(offset16); 381 } 382 383 char single = source[offset16]; 384 if (!isSurrogate(single)) { 385 return single; 386 } 387 388 // Convert the UTF-16 surrogate pair if necessary. 389 // For simplicity in usage, and because the frequency of pairs is 390 // low, look both directions. 391 if (single <= LEAD_SURROGATE_MAX_VALUE) { 392 offset16++; 393 if (offset16 >= limit) { 394 return single; 395 } 396 char trail = source[offset16]; 397 if (isTrailSurrogate(trail)) { 398 return Character.toCodePoint(single, trail); 399 } 400 } else { // isTrailSurrogate(single), so 401 if (offset16 == start) { 402 return single; 403 } 404 offset16--; 405 char lead = source[offset16]; 406 if (isLeadSurrogate(lead)) 407 return Character.toCodePoint(lead, single); 408 } 409 return single; // return unmatched surrogate 410 } 411 412 /** 413 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 414 * <code>UTF16.getCharCount()</code>, as well as random access. 
If a validity check is 415 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 416 * </a></code> 417 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 418 * character will be returned. If a complete supplementary character is not found the incomplete 419 * character will be returned 420 * 421 * @param source UTF-16 chars string buffer 422 * @param offset16 UTF-16 offset to the start of the character. 423 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 424 * of that codepoint are the same as in <code>bounds32()</code>. 425 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 426 * @stable ICU 2.1 427 */ charAt(Replaceable source, int offset16)428 public static int charAt(Replaceable source, int offset16) { 429 if (offset16 < 0 || offset16 >= source.length()) { 430 throw new StringIndexOutOfBoundsException(offset16); 431 } 432 433 char single = source.charAt(offset16); 434 if (!isSurrogate(single)) { 435 return single; 436 } 437 438 // Convert the UTF-16 surrogate pair if necessary. 439 // For simplicity in usage, and because the frequency of pairs is 440 // low, look both directions. 441 442 if (single <= LEAD_SURROGATE_MAX_VALUE) { 443 ++offset16; 444 if (source.length() != offset16) { 445 char trail = source.charAt(offset16); 446 if (isTrailSurrogate(trail)) 447 return Character.toCodePoint(single, trail); 448 } 449 } else { 450 --offset16; 451 if (offset16 >= 0) { 452 // single is a trail surrogate so 453 char lead = source.charAt(offset16); 454 if (isLeadSurrogate(lead)) { 455 return Character.toCodePoint(lead, single); 456 } 457 } 458 } 459 return single; // return unmatched surrogate 460 } 461 462 /** 463 * Determines how many chars this char32 requires. If a validity check is required, use <code> 464 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 465 * on char32 before calling. 
466 * 467 * @param char32 The input codepoint. 468 * @return 2 if is in supplementary space, otherwise 1. 469 * @stable ICU 2.1 470 */ getCharCount(int char32)471 public static int getCharCount(int char32) { 472 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 473 return 1; 474 } 475 return 2; 476 } 477 478 /** 479 * Returns the type of the boundaries around the char at offset16. Used for random access. 480 * 481 * @param source Text to analyse 482 * @param offset16 UTF-16 offset 483 * @return 484 * <ul> 485 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1] 486 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 487 * are [offset16, offset16 + 2] 488 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 489 * bounds are [offset16 - 1, offset16 + 1] 490 * </ul> 491 * For bit-twiddlers, the return values for these are chosen so that the boundaries 492 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 493 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 494 * @stable ICU 2.1 495 */ bounds(String source, int offset16)496 public static int bounds(String source, int offset16) { 497 char ch = source.charAt(offset16); 498 if (isSurrogate(ch)) { 499 if (isLeadSurrogate(ch)) { 500 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 501 return LEAD_SURROGATE_BOUNDARY; 502 } 503 } else { 504 // isTrailSurrogate(ch), so 505 --offset16; 506 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 507 return TRAIL_SURROGATE_BOUNDARY; 508 } 509 } 510 } 511 return SINGLE_CHAR_BOUNDARY; 512 } 513 514 /** 515 * Returns the type of the boundaries around the char at offset16. Used for random access. 
516 * 517 * @param source String buffer to analyse 518 * @param offset16 UTF16 offset 519 * @return 520 * <ul> 521 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1] 522 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 523 * are [offset16, offset16 + 2] 524 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 525 * bounds are [offset16 - 1, offset16 + 1] 526 * </ul> 527 * For bit-twiddlers, the return values for these are chosen so that the boundaries 528 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 529 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 530 * @stable ICU 2.1 531 */ bounds(StringBuffer source, int offset16)532 public static int bounds(StringBuffer source, int offset16) { 533 char ch = source.charAt(offset16); 534 if (isSurrogate(ch)) { 535 if (isLeadSurrogate(ch)) { 536 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 537 return LEAD_SURROGATE_BOUNDARY; 538 } 539 } else { 540 // isTrailSurrogate(ch), so 541 --offset16; 542 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 543 return TRAIL_SURROGATE_BOUNDARY; 544 } 545 } 546 } 547 return SINGLE_CHAR_BOUNDARY; 548 } 549 550 /** 551 * Returns the type of the boundaries around the char at offset16. Used for random access. Note 552 * that the boundaries are determined with respect to the subarray, hence the char array 553 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1. 
554 * 555 * @param source Char array to analyse 556 * @param start Offset to substring in the source array for analyzing 557 * @param limit Offset to substring in the source array for analyzing 558 * @param offset16 UTF16 offset relative to start 559 * @return 560 * <ul> 561 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are 562 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 563 * are [offset16, offset16 + 2] 564 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 565 * bounds are [offset16 - 1, offset16 + 1] 566 * </ul> 567 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries 568 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)]. 569 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 570 * @stable ICU 2.1 571 */ bounds(char source[], int start, int limit, int offset16)572 public static int bounds(char source[], int start, int limit, int offset16) { 573 offset16 += start; 574 if (offset16 < start || offset16 >= limit) { 575 throw new ArrayIndexOutOfBoundsException(offset16); 576 } 577 char ch = source[offset16]; 578 if (isSurrogate(ch)) { 579 if (isLeadSurrogate(ch)) { 580 ++offset16; 581 if (offset16 < limit && isTrailSurrogate(source[offset16])) { 582 return LEAD_SURROGATE_BOUNDARY; 583 } 584 } else { // isTrailSurrogate(ch), so 585 --offset16; 586 if (offset16 >= start && isLeadSurrogate(source[offset16])) { 587 return TRAIL_SURROGATE_BOUNDARY; 588 } 589 } 590 } 591 return SINGLE_CHAR_BOUNDARY; 592 } 593 594 /** 595 * Determines whether the code value is a surrogate. 596 * 597 * @param char16 The input character. 598 * @return true If the input character is a surrogate. 
599 * @stable ICU 2.1 600 */ isSurrogate(char char16)601 public static boolean isSurrogate(char char16) { 602 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; 603 } 604 605 /** 606 * Determines whether the character is a trail surrogate. 607 * 608 * @param char16 The input character. 609 * @return true If the input character is a trail surrogate. 610 * @stable ICU 2.1 611 */ isTrailSurrogate(char char16)612 public static boolean isTrailSurrogate(char char16) { 613 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; 614 } 615 616 /** 617 * Determines whether the character is a lead surrogate. 618 * 619 * @param char16 The input character. 620 * @return true If the input character is a lead surrogate 621 * @stable ICU 2.1 622 */ isLeadSurrogate(char char16)623 public static boolean isLeadSurrogate(char char16) { 624 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; 625 } 626 627 /** 628 * Returns the lead surrogate. If a validity check is required, use 629 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 630 * before calling. 631 * 632 * @param char32 The input character. 633 * @return lead surrogate if the getCharCount(ch) is 2; <br> 634 * and 0 otherwise (note: 0 is not a valid lead surrogate). 635 * @stable ICU 2.1 636 */ getLeadSurrogate(int char32)637 public static char getLeadSurrogate(int char32) { 638 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 639 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_)); 640 } 641 return 0; 642 } 643 644 /** 645 * Returns the trail surrogate. If a validity check is required, use 646 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 647 * before calling. 648 * 649 * @param char32 The input character. 
650 * @return the trail surrogate if the getCharCount(ch) is 2; <br> 651 * otherwise the character itself 652 * @stable ICU 2.1 653 */ getTrailSurrogate(int char32)654 public static char getTrailSurrogate(int char32) { 655 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 656 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_)); 657 } 658 return (char) char32; 659 } 660 661 /** 662 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string 663 * containing the UTF-32 value in UTF16 format. If a validity check is required, use 664 * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before calling. 665 * 666 * @param char32 The input character. 667 * @return string value of char32 in UTF16 format 668 * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint. 669 * @stable ICU 2.1 670 */ valueOf(int char32)671 public static String valueOf(int char32) { 672 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 673 throw new IllegalArgumentException("Illegal codepoint"); 674 } 675 return toString(char32); 676 } 677 678 /** 679 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or 680 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate 681 * character, the whole supplementary codepoint will be returned. If a validity check is 682 * required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the 683 * codepoint at offset16 before calling. The result returned will be a newly created String 684 * obtained by calling source.substring(..) with the appropriate indexes. 685 * 686 * @param source The input string. 
687 * @param offset16 The UTF16 index to the codepoint in source 688 * @return string value of char32 in UTF16 format 689 * @stable ICU 2.1 690 */ valueOf(String source, int offset16)691 public static String valueOf(String source, int offset16) { 692 switch (bounds(source, offset16)) { 693 case LEAD_SURROGATE_BOUNDARY: 694 return source.substring(offset16, offset16 + 2); 695 case TRAIL_SURROGATE_BOUNDARY: 696 return source.substring(offset16 - 1, offset16 + 1); 697 default: 698 return source.substring(offset16, offset16 + 1); 699 } 700 } 701 702 /** 703 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a 704 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a 705 * surrogate character, the whole supplementary codepoint will be returned. If a validity check 706 * is required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on 707 * the codepoint at offset16 before calling. The result returned will be a newly created String 708 * obtained by calling source.substring(..) with the appropriate indexes. 709 * 710 * @param source The input string buffer. 711 * @param offset16 The UTF16 index to the codepoint in source 712 * @return string value of char32 in UTF16 format 713 * @stable ICU 2.1 714 */ valueOf(StringBuffer source, int offset16)715 public static String valueOf(StringBuffer source, int offset16) { 716 switch (bounds(source, offset16)) { 717 case LEAD_SURROGATE_BOUNDARY: 718 return source.substring(offset16, offset16 + 2); 719 case TRAIL_SURROGATE_BOUNDARY: 720 return source.substring(offset16 - 1, offset16 + 1); 721 default: 722 return source.substring(offset16, offset16 + 1); 723 } 724 } 725 726 /** 727 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16 728 * format. 
If offset16 indexes a surrogate character, the whole supplementary codepoint will be 729 * returned, except when either the leading or trailing surrogate character lies out of the 730 * specified subarray. In the latter case, only the surrogate character within bounds will be 731 * returned. If a validity check is required, use 732 * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the codepoint at 733 * offset16 before calling. The result returned will be a newly created String containing the 734 * relevant characters. 735 * 736 * @param source The input char array. 737 * @param start Start index of the subarray 738 * @param limit End index of the subarray 739 * @param offset16 The UTF16 index to the codepoint in source relative to start 740 * @return string value of char32 in UTF16 format 741 * @stable ICU 2.1 742 */ valueOf(char source[], int start, int limit, int offset16)743 public static String valueOf(char source[], int start, int limit, int offset16) { 744 switch (bounds(source, start, limit, offset16)) { 745 case LEAD_SURROGATE_BOUNDARY: 746 return new String(source, start + offset16, 2); 747 case TRAIL_SURROGATE_BOUNDARY: 748 return new String(source, start + offset16 - 1, 2); 749 } 750 return new String(source, start + offset16, 1); 751 } 752 753 /** 754 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 755 * the {@link UTF16 class description} for notes on roundtripping. 756 * 757 * @param source The UTF-16 string 758 * @param offset32 UTF-32 offset 759 * @return UTF-16 offset 760 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 
761 * @stable ICU 2.1 762 */ findOffsetFromCodePoint(String source, int offset32)763 public static int findOffsetFromCodePoint(String source, int offset32) { 764 char ch; 765 int size = source.length(), result = 0, count = offset32; 766 if (offset32 < 0 || offset32 > size) { 767 throw new StringIndexOutOfBoundsException(offset32); 768 } 769 while (result < size && count > 0) { 770 ch = source.charAt(result); 771 if (isLeadSurrogate(ch) && ((result + 1) < size) 772 && isTrailSurrogate(source.charAt(result + 1))) { 773 result++; 774 } 775 776 count--; 777 result++; 778 } 779 if (count != 0) { 780 throw new StringIndexOutOfBoundsException(offset32); 781 } 782 return result; 783 } 784 785 /** 786 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 787 * the {@link UTF16 class description} for notes on roundtripping. 788 * 789 * @param source The UTF-16 string buffer 790 * @param offset32 UTF-32 offset 791 * @return UTF-16 offset 792 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 793 * @stable ICU 2.1 794 */ findOffsetFromCodePoint(StringBuffer source, int offset32)795 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) { 796 char ch; 797 int size = source.length(), result = 0, count = offset32; 798 if (offset32 < 0 || offset32 > size) { 799 throw new StringIndexOutOfBoundsException(offset32); 800 } 801 while (result < size && count > 0) { 802 ch = source.charAt(result); 803 if (isLeadSurrogate(ch) && ((result + 1) < size) 804 && isTrailSurrogate(source.charAt(result + 1))) { 805 result++; 806 } 807 808 count--; 809 result++; 810 } 811 if (count != 0) { 812 throw new StringIndexOutOfBoundsException(offset32); 813 } 814 return result; 815 } 816 817 /** 818 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 819 * the {@link UTF16 class description} for notes on roundtripping. 
820 * 821 * @param source The UTF-16 char array whose substring is to be analysed 822 * @param start Offset of the substring to be analysed 823 * @param limit Offset of the substring to be analysed 824 * @param offset32 UTF-32 offset relative to start 825 * @return UTF-16 offset relative to start 826 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 827 * @stable ICU 2.1 828 */ findOffsetFromCodePoint(char source[], int start, int limit, int offset32)829 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) { 830 char ch; 831 int result = start, count = offset32; 832 if (offset32 > limit - start) { 833 throw new ArrayIndexOutOfBoundsException(offset32); 834 } 835 while (result < limit && count > 0) { 836 ch = source[result]; 837 if (isLeadSurrogate(ch) && ((result + 1) < limit) 838 && isTrailSurrogate(source[result + 1])) { 839 result++; 840 } 841 842 count--; 843 result++; 844 } 845 if (count != 0) { 846 throw new ArrayIndexOutOfBoundsException(offset32); 847 } 848 return result - start; 849 } 850 851 /** 852 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given 853 * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for 854 * notes on roundtripping.<br> 855 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 856 * of the <strong>lead</strong> of the pair is returned. </i> 857 * <p> 858 * To find the UTF-32 length of a string, use: 859 * 860 * <pre> 861 * len32 = countCodePoint(source, source.length()); 862 * </pre> 863 * 864 * @param source Text to analyse 865 * @param offset16 UTF-16 offset < source text length. 866 * @return UTF-32 offset 867 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
868 * @stable ICU 2.1 869 */ findCodePointOffset(String source, int offset16)870 public static int findCodePointOffset(String source, int offset16) { 871 if (offset16 < 0 || offset16 > source.length()) { 872 throw new StringIndexOutOfBoundsException(offset16); 873 } 874 875 int result = 0; 876 char ch; 877 boolean hadLeadSurrogate = false; 878 879 for (int i = 0; i < offset16; ++i) { 880 ch = source.charAt(i); 881 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 882 hadLeadSurrogate = false; // count valid trail as zero 883 } else { 884 hadLeadSurrogate = isLeadSurrogate(ch); 885 ++result; // count others as 1 886 } 887 } 888 889 if (offset16 == source.length()) { 890 return result; 891 } 892 893 // end of source being the less significant surrogate character 894 // shift result back to the start of the supplementary character 895 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 896 result--; 897 } 898 899 return result; 900 } 901 902 /** 903 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 904 * offset. Used for random access. See the {@link UTF16 class description} for notes on 905 * roundtripping.<br> 906 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 907 * of the <strong>lead</strong> of the pair is returned. </i> 908 * <p> 909 * To find the UTF-32 length of a string, use: 910 * 911 * <pre> 912 * len32 = countCodePoint(source); 913 * </pre> 914 * 915 * @param source Text to analyse 916 * @param offset16 UTF-16 offset < source text length. 917 * @return UTF-32 offset 918 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
919 * @stable ICU 2.1 920 */ findCodePointOffset(StringBuffer source, int offset16)921 public static int findCodePointOffset(StringBuffer source, int offset16) { 922 if (offset16 < 0 || offset16 > source.length()) { 923 throw new StringIndexOutOfBoundsException(offset16); 924 } 925 926 int result = 0; 927 char ch; 928 boolean hadLeadSurrogate = false; 929 930 for (int i = 0; i < offset16; ++i) { 931 ch = source.charAt(i); 932 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 933 hadLeadSurrogate = false; // count valid trail as zero 934 } else { 935 hadLeadSurrogate = isLeadSurrogate(ch); 936 ++result; // count others as 1 937 } 938 } 939 940 if (offset16 == source.length()) { 941 return result; 942 } 943 944 // end of source being the less significant surrogate character 945 // shift result back to the start of the supplementary character 946 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 947 result--; 948 } 949 950 return result; 951 } 952 953 /** 954 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 955 * offset. Used for random access. See the {@link UTF16 class description} for notes on 956 * roundtripping.<br> 957 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 958 * of the <strong>lead</strong> of the pair is returned. </i> 959 * <p> 960 * To find the UTF-32 length of a substring, use: 961 * 962 * <pre> 963 * len32 = countCodePoint(source, start, limit); 964 * </pre> 965 * 966 * @param source Text to analyse 967 * @param start Offset of the substring 968 * @param limit Offset of the substring 969 * @param offset16 UTF-16 relative to start 970 * @return UTF-32 offset relative to start 971 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 
972 * @stable ICU 2.1 973 */ findCodePointOffset(char source[], int start, int limit, int offset16)974 public static int findCodePointOffset(char source[], int start, int limit, int offset16) { 975 offset16 += start; 976 if (offset16 > limit) { 977 throw new StringIndexOutOfBoundsException(offset16); 978 } 979 980 int result = 0; 981 char ch; 982 boolean hadLeadSurrogate = false; 983 984 for (int i = start; i < offset16; ++i) { 985 ch = source[i]; 986 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 987 hadLeadSurrogate = false; // count valid trail as zero 988 } else { 989 hadLeadSurrogate = isLeadSurrogate(ch); 990 ++result; // count others as 1 991 } 992 } 993 994 if (offset16 == limit) { 995 return result; 996 } 997 998 // end of source being the less significant surrogate character 999 // shift result back to the start of the supplementary character 1000 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) { 1001 result--; 1002 } 1003 1004 return result; 1005 } 1006 1007 /** 1008 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required, 1009 * use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before 1010 * calling. 1011 * 1012 * @param target The buffer to append to 1013 * @param char32 Value to append. 
1014 * @return the updated StringBuffer 1015 * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints 1016 * @stable ICU 2.1 1017 */ append(StringBuffer target, int char32)1018 public static StringBuffer append(StringBuffer target, int char32) { 1019 // Check for irregular values 1020 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1021 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); 1022 } 1023 1024 // Write the UTF-16 values 1025 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1026 target.append(getLeadSurrogate(char32)); 1027 target.append(getTrailSurrogate(char32)); 1028 } else { 1029 target.append((char) char32); 1030 } 1031 return target; 1032 } 1033 1034 /** 1035 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a 1036 * convenience. 1037 * 1038 * @param target The buffer to append to 1039 * @param cp The code point to append 1040 * @return the updated StringBuffer 1041 * @throws IllegalArgumentException If cp is not a valid code point 1042 * @stable ICU 3.0 1043 */ appendCodePoint(StringBuffer target, int cp)1044 public static StringBuffer appendCodePoint(StringBuffer target, int cp) { 1045 return append(target, cp); 1046 } 1047 1048 /** 1049 * Adds a codepoint to offset16 position of the argument char array. 1050 * 1051 * @param target Char array to be append with the new code point 1052 * @param limit UTF16 offset which the codepoint will be appended. 1053 * @param char32 Code point to be appended 1054 * @return offset after char32 in the array. 1055 * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not 1056 * lie within the range of the Unicode codepoints. 
1057 * @stable ICU 2.1 1058 */ append(char[] target, int limit, int char32)1059 public static int append(char[] target, int limit, int char32) { 1060 // Check for irregular values 1061 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1062 throw new IllegalArgumentException("Illegal codepoint"); 1063 } 1064 // Write the UTF-16 values 1065 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1066 target[limit++] = getLeadSurrogate(char32); 1067 target[limit++] = getTrailSurrogate(char32); 1068 } else { 1069 target[limit++] = (char) char32; 1070 } 1071 return limit; 1072 } 1073 1074 /** 1075 * Number of codepoints in a UTF16 String 1076 * 1077 * @param source UTF16 string 1078 * @return number of codepoint in string 1079 * @stable ICU 2.1 1080 */ countCodePoint(String source)1081 public static int countCodePoint(String source) { 1082 if (source == null || source.length() == 0) { 1083 return 0; 1084 } 1085 return findCodePointOffset(source, source.length()); 1086 } 1087 1088 /** 1089 * Number of codepoints in a UTF16 String buffer 1090 * 1091 * @param source UTF16 string buffer 1092 * @return number of codepoint in string 1093 * @stable ICU 2.1 1094 */ countCodePoint(StringBuffer source)1095 public static int countCodePoint(StringBuffer source) { 1096 if (source == null || source.length() == 0) { 1097 return 0; 1098 } 1099 return findCodePointOffset(source, source.length()); 1100 } 1101 1102 /** 1103 * Number of codepoints in a UTF16 char array substring 1104 * 1105 * @param source UTF16 char array 1106 * @param start Offset of the substring 1107 * @param limit Offset of the substring 1108 * @return number of codepoint in the substring 1109 * @exception IndexOutOfBoundsException If start and limit are not valid. 
1110 * @stable ICU 2.1 1111 */ countCodePoint(char source[], int start, int limit)1112 public static int countCodePoint(char source[], int start, int limit) { 1113 if (source == null || source.length == 0) { 1114 return 0; 1115 } 1116 return findCodePointOffset(source, start, limit, limit - start); 1117 } 1118 1119 /** 1120 * Set a code point into a UTF16 position. Adjusts target according if we are replacing a 1121 * non-supplementary codepoint with a supplementary and vice versa. 1122 * 1123 * @param target Stringbuffer 1124 * @param offset16 UTF16 position to insert into 1125 * @param char32 Code point 1126 * @stable ICU 2.1 1127 */ setCharAt(StringBuffer target, int offset16, int char32)1128 public static void setCharAt(StringBuffer target, int offset16, int char32) { 1129 int count = 1; 1130 char single = target.charAt(offset16); 1131 1132 if (isSurrogate(single)) { 1133 // pairs of the surrogate with offset16 at the lead char found 1134 if (isLeadSurrogate(single) && (target.length() > offset16 + 1) 1135 && isTrailSurrogate(target.charAt(offset16 + 1))) { 1136 count++; 1137 } else { 1138 // pairs of the surrogate with offset16 at the trail char 1139 // found 1140 if (isTrailSurrogate(single) && (offset16 > 0) 1141 && isLeadSurrogate(target.charAt(offset16 - 1))) { 1142 offset16--; 1143 count++; 1144 } 1145 } 1146 } 1147 target.replace(offset16, offset16 + count, valueOf(char32)); 1148 } 1149 1150 /** 1151 * Set a code point into a UTF16 position in a char array. Adjusts target according if we are 1152 * replacing a non-supplementary codepoint with a supplementary and vice versa. 1153 * 1154 * @param target char array 1155 * @param limit numbers of valid chars in target, different from target.length. limit counts the 1156 * number of chars in target that represents a string, not the size of array target. 
1157 * @param offset16 UTF16 position to insert into 1158 * @param char32 code point 1159 * @return new number of chars in target that represents a string 1160 * @exception IndexOutOfBoundsException if offset16 is out of range 1161 * @stable ICU 2.1 1162 */ setCharAt(char target[], int limit, int offset16, int char32)1163 public static int setCharAt(char target[], int limit, int offset16, int char32) { 1164 if (offset16 >= limit) { 1165 throw new ArrayIndexOutOfBoundsException(offset16); 1166 } 1167 int count = 1; 1168 char single = target[offset16]; 1169 1170 if (isSurrogate(single)) { 1171 // pairs of the surrogate with offset16 at the lead char found 1172 if (isLeadSurrogate(single) && (target.length > offset16 + 1) 1173 && isTrailSurrogate(target[offset16 + 1])) { 1174 count++; 1175 } else { 1176 // pairs of the surrogate with offset16 at the trail char 1177 // found 1178 if (isTrailSurrogate(single) && (offset16 > 0) 1179 && isLeadSurrogate(target[offset16 - 1])) { 1180 offset16--; 1181 count++; 1182 } 1183 } 1184 } 1185 1186 String str = valueOf(char32); 1187 int result = limit; 1188 int strlength = str.length(); 1189 target[offset16] = str.charAt(0); 1190 if (count == strlength) { 1191 if (count == 2) { 1192 target[offset16 + 1] = str.charAt(1); 1193 } 1194 } else { 1195 // this is not exact match in space, we'll have to do some 1196 // shifting 1197 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit 1198 - (offset16 + count)); 1199 if (count < strlength) { 1200 // char32 is a supplementary character trying to squeeze into 1201 // a non-supplementary space 1202 target[offset16 + 1] = str.charAt(1); 1203 result++; 1204 if (result < target.length) { 1205 target[result] = 0; 1206 } 1207 } else { 1208 // char32 is a non-supplementary character trying to fill 1209 // into a supplementary space 1210 result--; 1211 target[result] = 0; 1212 } 1213 } 1214 return result; 1215 } 1216 1217 /** 1218 * Shifts offset16 by the argument number of 
codepoints 1219 * 1220 * @param source string 1221 * @param offset16 UTF16 position to shift 1222 * @param shift32 number of codepoints to shift 1223 * @return new shifted offset16 1224 * @exception IndexOutOfBoundsException if the new offset16 is out of bounds. 1225 * @stable ICU 2.1 1226 */ moveCodePointOffset(String source, int offset16, int shift32)1227 public static int moveCodePointOffset(String source, int offset16, int shift32) { 1228 int result = offset16; 1229 int size = source.length(); 1230 int count; 1231 char ch; 1232 if (offset16 < 0 || offset16 > size) { 1233 throw new StringIndexOutOfBoundsException(offset16); 1234 } 1235 if (shift32 > 0) { 1236 if (shift32 + offset16 > size) { 1237 throw new StringIndexOutOfBoundsException(offset16); 1238 } 1239 count = shift32; 1240 while (result < size && count > 0) { 1241 ch = source.charAt(result); 1242 if (isLeadSurrogate(ch) && ((result + 1) < size) 1243 && isTrailSurrogate(source.charAt(result + 1))) { 1244 result++; 1245 } 1246 count--; 1247 result++; 1248 } 1249 } else { 1250 if (offset16 + shift32 < 0) { 1251 throw new StringIndexOutOfBoundsException(offset16); 1252 } 1253 for (count = -shift32; count > 0; count--) { 1254 result--; 1255 if (result < 0) { 1256 break; 1257 } 1258 ch = source.charAt(result); 1259 if (isTrailSurrogate(ch) && result > 0 1260 && isLeadSurrogate(source.charAt(result - 1))) { 1261 result--; 1262 } 1263 } 1264 } 1265 if (count != 0) { 1266 throw new StringIndexOutOfBoundsException(shift32); 1267 } 1268 return result; 1269 } 1270 1271 /** 1272 * Shifts offset16 by the argument number of codepoints 1273 * 1274 * @param source String buffer 1275 * @param offset16 UTF16 position to shift 1276 * @param shift32 Number of codepoints to shift 1277 * @return new shifted offset16 1278 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds. 
1279 * @stable ICU 2.1 1280 */ moveCodePointOffset(StringBuffer source, int offset16, int shift32)1281 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) { 1282 int result = offset16; 1283 int size = source.length(); 1284 int count; 1285 char ch; 1286 if (offset16 < 0 || offset16 > size) { 1287 throw new StringIndexOutOfBoundsException(offset16); 1288 } 1289 if (shift32 > 0) { 1290 if (shift32 + offset16 > size) { 1291 throw new StringIndexOutOfBoundsException(offset16); 1292 } 1293 count = shift32; 1294 while (result < size && count > 0) { 1295 ch = source.charAt(result); 1296 if (isLeadSurrogate(ch) && ((result + 1) < size) 1297 && isTrailSurrogate(source.charAt(result + 1))) { 1298 result++; 1299 } 1300 count--; 1301 result++; 1302 } 1303 } else { 1304 if (offset16 + shift32 < 0) { 1305 throw new StringIndexOutOfBoundsException(offset16); 1306 } 1307 for (count = -shift32; count > 0; count--) { 1308 result--; 1309 if (result < 0) { 1310 break; 1311 } 1312 ch = source.charAt(result); 1313 if (isTrailSurrogate(ch) && result > 0 1314 && isLeadSurrogate(source.charAt(result - 1))) { 1315 result--; 1316 } 1317 } 1318 } 1319 if (count != 0) { 1320 throw new StringIndexOutOfBoundsException(shift32); 1321 } 1322 return result; 1323 } 1324 1325 /** 1326 * Shifts offset16 by the argument number of codepoints within a subarray. 1327 * 1328 * @param source Char array 1329 * @param start Position of the subarray to be performed on 1330 * @param limit Position of the subarray to be performed on 1331 * @param offset16 UTF16 position to shift relative to start 1332 * @param shift32 Number of codepoints to shift 1333 * @return new shifted offset16 relative to start 1334 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the 1335 * subarray bounds are out of range. 
1336 * @stable ICU 2.1 1337 */ moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32)1338 public static int moveCodePointOffset(char source[], int start, int limit, int offset16, 1339 int shift32) { 1340 int size = source.length; 1341 int count; 1342 char ch; 1343 int result = offset16 + start; 1344 if (start < 0 || limit < start) { 1345 throw new StringIndexOutOfBoundsException(start); 1346 } 1347 if (limit > size) { 1348 throw new StringIndexOutOfBoundsException(limit); 1349 } 1350 if (offset16 < 0 || result > limit) { 1351 throw new StringIndexOutOfBoundsException(offset16); 1352 } 1353 if (shift32 > 0) { 1354 if (shift32 + result > size) { 1355 throw new StringIndexOutOfBoundsException(result); 1356 } 1357 count = shift32; 1358 while (result < limit && count > 0) { 1359 ch = source[result]; 1360 if (isLeadSurrogate(ch) && (result + 1 < limit) 1361 && isTrailSurrogate(source[result + 1])) { 1362 result++; 1363 } 1364 count--; 1365 result++; 1366 } 1367 } else { 1368 if (result + shift32 < start) { 1369 throw new StringIndexOutOfBoundsException(result); 1370 } 1371 for (count = -shift32; count > 0; count--) { 1372 result--; 1373 if (result < start) { 1374 break; 1375 } 1376 ch = source[result]; 1377 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { 1378 result--; 1379 } 1380 } 1381 } 1382 if (count != 0) { 1383 throw new StringIndexOutOfBoundsException(shift32); 1384 } 1385 result -= start; 1386 return result; 1387 } 1388 1389 /** 1390 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1391 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1392 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2 1393 * otherwise. 
1394 * <p> 1395 * The overall effect is exactly as if the argument were converted to a string by the method 1396 * valueOf(char) and the characters in that string were then inserted into target at the 1397 * position indicated by offset16. 1398 * </p> 1399 * <p> 1400 * The offset argument must be greater than or equal to 0, and less than or equal to the length 1401 * of source. 1402 * 1403 * @param target String buffer to insert to 1404 * @param offset16 Offset which char32 will be inserted in 1405 * @param char32 Codepoint to be inserted 1406 * @return a reference to target 1407 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1408 * @stable ICU 2.1 1409 */ insert(StringBuffer target, int offset16, int char32)1410 public static StringBuffer insert(StringBuffer target, int offset16, int char32) { 1411 String str = valueOf(char32); 1412 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1413 offset16++; 1414 } 1415 target.insert(offset16, str); 1416 return target; 1417 } 1418 1419 /** 1420 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1421 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1422 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise. 1423 * <p> 1424 * The overall effect is exactly as if the argument were converted to a string by the method 1425 * valueOf(char) and the characters in that string were then inserted into target at the 1426 * position indicated by offset16. 1427 * </p> 1428 * <p> 1429 * The offset argument must be greater than or equal to 0, and less than or equal to the limit. 
1430 * 1431 * @param target Char array to insert to 1432 * @param limit End index of the char array, limit <= target.length 1433 * @param offset16 Offset which char32 will be inserted in 1434 * @param char32 Codepoint to be inserted 1435 * @return new limit size 1436 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1437 * @stable ICU 2.1 1438 */ insert(char target[], int limit, int offset16, int char32)1439 public static int insert(char target[], int limit, int offset16, int char32) { 1440 String str = valueOf(char32); 1441 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1442 offset16++; 1443 } 1444 int size = str.length(); 1445 if (limit + size > target.length) { 1446 throw new ArrayIndexOutOfBoundsException(offset16 + size); 1447 } 1448 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16); 1449 target[offset16] = str.charAt(0); 1450 if (size == 2) { 1451 target[offset16 + 1] = str.charAt(1); 1452 } 1453 return limit + size; 1454 } 1455 1456 /** 1457 * Removes the codepoint at the specified position in this target (shortening target by 1 1458 * character if the codepoint is a non-supplementary, 2 otherwise). 1459 * 1460 * @param target String buffer to remove codepoint from 1461 * @param offset16 Offset which the codepoint will be removed 1462 * @return a reference to target 1463 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 
1464 * @stable ICU 2.1 1465 */ delete(StringBuffer target, int offset16)1466 public static StringBuffer delete(StringBuffer target, int offset16) { 1467 int count = 1; 1468 switch (bounds(target, offset16)) { 1469 case LEAD_SURROGATE_BOUNDARY: 1470 count++; 1471 break; 1472 case TRAIL_SURROGATE_BOUNDARY: 1473 count++; 1474 offset16--; 1475 break; 1476 } 1477 target.delete(offset16, offset16 + count); 1478 return target; 1479 } 1480 1481 /** 1482 * Removes the codepoint at the specified position in this target (shortening target by 1 1483 * character if the codepoint is a non-supplementary, 2 otherwise). 1484 * 1485 * @param target String buffer to remove codepoint from 1486 * @param limit End index of the char array, limit <= target.length 1487 * @param offset16 Offset which the codepoint will be removed 1488 * @return a new limit size 1489 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1490 * @stable ICU 2.1 1491 */ delete(char target[], int limit, int offset16)1492 public static int delete(char target[], int limit, int offset16) { 1493 int count = 1; 1494 switch (bounds(target, 0, limit, offset16)) { 1495 case LEAD_SURROGATE_BOUNDARY: 1496 count++; 1497 break; 1498 case TRAIL_SURROGATE_BOUNDARY: 1499 count++; 1500 offset16--; 1501 break; 1502 } 1503 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count)); 1504 target[limit - count] = 0; 1505 return limit - count; 1506 } 1507 1508 /** 1509 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1510 * the argument codepoint. I.e., the smallest index <code>i</code> such that 1511 * <code>UTF16.charAt(source, i) == 1512 * char32</code> is true. 1513 * <p> 1514 * If no such character occurs in this string, then -1 is returned. 
1515 * </p> 1516 * <p> 1517 * Examples:<br> 1518 * UTF16.indexOf("abc", 'a') returns 0<br> 1519 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1520 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1521 * </p> 1522 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1523 * characters to its fullest. 1524 * 1525 * @param source UTF16 format Unicode string that will be searched 1526 * @param char32 Codepoint to search for 1527 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1528 * -1 if the codepoint does not occur. 1529 * @stable ICU 2.6 1530 */ indexOf(String source, int char32)1531 public static int indexOf(String source, int char32) { 1532 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1533 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1534 } 1535 // non-surrogate bmp 1536 if (char32 < LEAD_SURROGATE_MIN_VALUE 1537 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1538 return source.indexOf((char) char32); 1539 } 1540 // surrogate 1541 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1542 int result = source.indexOf((char) char32); 1543 if (result >= 0) { 1544 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1545 && isTrailSurrogate(source.charAt(result + 1))) { 1546 return indexOf(source, char32, result + 1); 1547 } 1548 // trail surrogate 1549 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1550 return indexOf(source, char32, result + 1); 1551 } 1552 } 1553 return result; 1554 } 1555 // supplementary 1556 String char32str = toString(char32); 1557 return source.indexOf(char32str); 1558 } 1559 1560 /** 1561 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1562 * the argument string str. 
This method is implemented based on codepoints, hence a "lead 1563 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1564 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1565 * character before str found at in source will not have a valid match. Vice versa for lead 1566 * surrogates that ends str. See example below. 1567 * <p> 1568 * If no such string str occurs in this source, then -1 is returned. 1569 * </p> 1570 * <p> 1571 * Examples:<br> 1572 * UTF16.indexOf("abc", "ab") returns 0<br> 1573 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br> 1574 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br> 1575 * </p> 1576 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1577 * characters to its fullest. 1578 * 1579 * @param source UTF16 format Unicode string that will be searched 1580 * @param str UTF16 format Unicode string to search for 1581 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1582 * -1 if the codepoint does not occur. 
1583 * @stable ICU 2.6 1584 */ indexOf(String source, String str)1585 public static int indexOf(String source, String str) { 1586 int strLength = str.length(); 1587 // non-surrogate ends 1588 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1589 return source.indexOf(str); 1590 } 1591 1592 int result = source.indexOf(str); 1593 int resultEnd = result + strLength; 1594 if (result >= 0) { 1595 // check last character 1596 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1597 && isTrailSurrogate(source.charAt(resultEnd + 1))) { 1598 return indexOf(source, str, resultEnd + 1); 1599 } 1600 // check first character which is a trail surrogate 1601 if (isTrailSurrogate(str.charAt(0)) && result > 0 1602 && isLeadSurrogate(source.charAt(result - 1))) { 1603 return indexOf(source, str, resultEnd + 1); 1604 } 1605 } 1606 return result; 1607 } 1608 1609 /** 1610 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1611 * the argument codepoint. I.e., the smallest index i such that: <br> 1612 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true. 1613 * <p> 1614 * If no such character occurs in this string, then -1 is returned. 1615 * </p> 1616 * <p> 1617 * Examples:<br> 1618 * UTF16.indexOf("abc", 'a', 1) returns -1<br> 1619 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br> 1620 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br> 1621 * </p> 1622 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1623 * characters to its fullest. 1624 * 1625 * @param source UTF16 format Unicode string that will be searched 1626 * @param char32 Codepoint to search for 1627 * @param fromIndex The index to start the search from. 1628 * @return the index of the first occurrence of the codepoint in the argument Unicode string at 1629 * or after fromIndex, or -1 if the codepoint does not occur. 
1630 * @stable ICU 2.6 1631 */ indexOf(String source, int char32, int fromIndex)1632 public static int indexOf(String source, int char32, int fromIndex) { 1633 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1634 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1635 } 1636 // non-surrogate bmp 1637 if (char32 < LEAD_SURROGATE_MIN_VALUE 1638 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1639 return source.indexOf((char) char32, fromIndex); 1640 } 1641 // surrogate 1642 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1643 int result = source.indexOf((char) char32, fromIndex); 1644 if (result >= 0) { 1645 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1646 && isTrailSurrogate(source.charAt(result + 1))) { 1647 return indexOf(source, char32, result + 1); 1648 } 1649 // trail surrogate 1650 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1651 return indexOf(source, char32, result + 1); 1652 } 1653 } 1654 return result; 1655 } 1656 // supplementary 1657 String char32str = toString(char32); 1658 return source.indexOf(char32str, fromIndex); 1659 } 1660 1661 /** 1662 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1663 * the argument string str. This method is implemented based on codepoints, hence a "lead 1664 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1665 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1666 * character before str found at in source will not have a valid match. Vice versa for lead 1667 * surrogates that ends str. See example below. 1668 * <p> 1669 * If no such string str occurs in this source, then -1 is returned. 
1670 * </p> 1671 * <p> 1672 * Examples:<br> 1673 * UTF16.indexOf("abc", "ab", 0) returns 0<br> 1674 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br> 1675 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br> 1676 * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br> 1677 * </p> 1678 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1679 * characters to its fullest. 1680 * 1681 * @param source UTF16 format Unicode string that will be searched 1682 * @param str UTF16 format Unicode string to search for 1683 * @param fromIndex The index to start the search from. 1684 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1685 * -1 if the codepoint does not occur. 1686 * @stable ICU 2.6 1687 */ indexOf(String source, String str, int fromIndex)1688 public static int indexOf(String source, String str, int fromIndex) { 1689 int strLength = str.length(); 1690 // non-surrogate ends 1691 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1692 return source.indexOf(str, fromIndex); 1693 } 1694 1695 int result = source.indexOf(str, fromIndex); 1696 int resultEnd = result + strLength; 1697 if (result >= 0) { 1698 // check last character 1699 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1700 && isTrailSurrogate(source.charAt(resultEnd))) { 1701 return indexOf(source, str, resultEnd + 1); 1702 } 1703 // check first character which is a trail surrogate 1704 if (isTrailSurrogate(str.charAt(0)) && result > 0 1705 && isLeadSurrogate(source.charAt(result - 1))) { 1706 return indexOf(source, str, resultEnd + 1); 1707 } 1708 } 1709 return result; 1710 } 1711 1712 /** 1713 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1714 * the argument codepoint. 
I.e., the index returned is the largest value i such that: 1715 * UTF16.charAt(source, i) == char32 is true. 1716 * <p> 1717 * Examples:<br> 1718 * UTF16.lastIndexOf("abc", 'a') returns 0<br> 1719 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1720 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1721 * </p> 1722 * <p> 1723 * source is searched backwards starting at the last character. 1724 * </p> 1725 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1726 * characters to its fullest. 1727 * 1728 * @param source UTF16 format Unicode string that will be searched 1729 * @param char32 Codepoint to search for 1730 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1731 * does not occur. 1732 * @stable ICU 2.6 1733 */ lastIndexOf(String source, int char32)1734 public static int lastIndexOf(String source, int char32) { 1735 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1736 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1737 } 1738 // non-surrogate bmp 1739 if (char32 < LEAD_SURROGATE_MIN_VALUE 1740 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1741 return source.lastIndexOf((char) char32); 1742 } 1743 // surrogate 1744 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1745 int result = source.lastIndexOf((char) char32); 1746 if (result >= 0) { 1747 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1748 && isTrailSurrogate(source.charAt(result + 1))) { 1749 return lastIndexOf(source, char32, result - 1); 1750 } 1751 // trail surrogate 1752 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1753 return lastIndexOf(source, char32, result - 1); 1754 } 1755 } 1756 return result; 1757 } 1758 // supplementary 1759 String char32str = toString(char32); 1760 return source.lastIndexOf(char32str); 1761 } 1762 1763 /** 1764 * Returns the index within the argument 
UTF16 format Unicode string of the last occurrence of 1765 * the argument string str. This method is implemented based on codepoints, hence a "lead 1766 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1767 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1768 * character before str found at in source will not have a valid match. Vice versa for lead 1769 * surrogates that ends str. See example below. 1770 * <p> 1771 * Examples:<br> 1772 * UTF16.lastIndexOf("abc", "a") returns 0<br> 1773 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br> 1774 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br> 1775 * </p> 1776 * <p> 1777 * source is searched backwards starting at the last character. 1778 * </p> 1779 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1780 * characters to its fullest. 1781 * 1782 * @param source UTF16 format Unicode string that will be searched 1783 * @param str UTF16 format Unicode string to search for 1784 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1785 * does not occur. 
1786 * @stable ICU 2.6 1787 */ lastIndexOf(String source, String str)1788 public static int lastIndexOf(String source, String str) { 1789 int strLength = str.length(); 1790 // non-surrogate ends 1791 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1792 return source.lastIndexOf(str); 1793 } 1794 1795 int result = source.lastIndexOf(str); 1796 if (result >= 0) { 1797 // check last character 1798 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1799 && isTrailSurrogate(source.charAt(result + strLength + 1))) { 1800 return lastIndexOf(source, str, result - 1); 1801 } 1802 // check first character which is a trail surrogate 1803 if (isTrailSurrogate(str.charAt(0)) && result > 0 1804 && isLeadSurrogate(source.charAt(result - 1))) { 1805 return lastIndexOf(source, str, result - 1); 1806 } 1807 } 1808 return result; 1809 } 1810 1811 /** 1812 * <p> 1813 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1814 * the argument codepoint, where the result is less than or equals to fromIndex. 1815 * </p> 1816 * <p> 1817 * This method is implemented based on codepoints, hence a single surrogate character will not 1818 * match a supplementary character. 1819 * </p> 1820 * <p> 1821 * source is searched backwards starting at the last character starting at the specified index. 1822 * </p> 1823 * <p> 1824 * Examples:<br> 1825 * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br> 1826 * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br> 1827 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br> 1828 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br> 1829 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1830 * </p> 1831 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1832 * characters to its fullest. 
1833 * 1834 * @param source UTF16 format Unicode string that will be searched 1835 * @param char32 Codepoint to search for 1836 * @param fromIndex the index to start the search from. There is no restriction on the value of 1837 * fromIndex. If it is greater than or equal to the length of this string, it has the 1838 * same effect as if it were equal to one less than the length of this string: this 1839 * entire string may be searched. If it is negative, it has the same effect as if it 1840 * were -1: -1 is returned. 1841 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1842 * does not occur. 1843 * @stable ICU 2.6 1844 */ lastIndexOf(String source, int char32, int fromIndex)1845 public static int lastIndexOf(String source, int char32, int fromIndex) { 1846 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1847 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1848 } 1849 // non-surrogate bmp 1850 if (char32 < LEAD_SURROGATE_MIN_VALUE 1851 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1852 return source.lastIndexOf((char) char32, fromIndex); 1853 } 1854 // surrogate 1855 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1856 int result = source.lastIndexOf((char) char32, fromIndex); 1857 if (result >= 0) { 1858 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1859 && isTrailSurrogate(source.charAt(result + 1))) { 1860 return lastIndexOf(source, char32, result - 1); 1861 } 1862 // trail surrogate 1863 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1864 return lastIndexOf(source, char32, result - 1); 1865 } 1866 } 1867 return result; 1868 } 1869 // supplementary 1870 String char32str = toString(char32); 1871 return source.lastIndexOf(char32str, fromIndex); 1872 } 1873 1874 /** 1875 * <p> 1876 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1877 * the argument string str, 
where the result is less than or equals to fromIndex. 1878 * </p> 1879 * <p> 1880 * This method is implemented based on codepoints, hence a "lead surrogate character + trail 1881 * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate 1882 * character at index 0, a source with a leading a surrogate character before str found at in 1883 * source will not have a valid match. Vice versa for lead surrogates that ends str. 1884 * </p> 1885 * See example below. 1886 * <p> 1887 * Examples:<br> 1888 * UTF16.lastIndexOf("abc", "c", 2) returns 2<br> 1889 * UTF16.lastIndexOf("abc", "c", 1) returns -1<br> 1890 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br> 1891 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br> 1892 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br> 1893 * </p> 1894 * <p> 1895 * source is searched backwards starting at the last character. 1896 * </p> 1897 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1898 * characters to its fullest. 1899 * 1900 * @param source UTF16 format Unicode string that will be searched 1901 * @param str UTF16 format Unicode string to search for 1902 * @param fromIndex the index to start the search from. There is no restriction on the value of 1903 * fromIndex. If it is greater than or equal to the length of this string, it has the 1904 * same effect as if it were equal to one less than the length of this string: this 1905 * entire string may be searched. If it is negative, it has the same effect as if it 1906 * were -1: -1 is returned. 1907 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1908 * does not occur. 
1909 * @stable ICU 2.6 1910 */ lastIndexOf(String source, String str, int fromIndex)1911 public static int lastIndexOf(String source, String str, int fromIndex) { 1912 int strLength = str.length(); 1913 // non-surrogate ends 1914 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1915 return source.lastIndexOf(str, fromIndex); 1916 } 1917 1918 int result = source.lastIndexOf(str, fromIndex); 1919 if (result >= 0) { 1920 // check last character 1921 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1922 && isTrailSurrogate(source.charAt(result + strLength))) { 1923 return lastIndexOf(source, str, result - 1); 1924 } 1925 // check first character which is a trail surrogate 1926 if (isTrailSurrogate(str.charAt(0)) && result > 0 1927 && isLeadSurrogate(source.charAt(result - 1))) { 1928 return lastIndexOf(source, str, result - 1); 1929 } 1930 } 1931 return result; 1932 } 1933 1934 /** 1935 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of 1936 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16 1937 * format Unicode string source, then source will be returned. Otherwise, a new String object is 1938 * created that represents a codepoint sequence identical to the codepoint sequence represented 1939 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of 1940 * newChar32. 
1941 * <p> 1942 * Examples: <br> 1943 * UTF16.replace("mesquite in your cellar", 'e', 'o');<br> 1944 * returns "mosquito in your collar"<br> 1945 * UTF16.replace("JonL", 'q', 'x');<br> 1946 * returns "JonL" (no change)<br> 1947 * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br> 1948 * returns "Supplementary character !"<br> 1949 * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br> 1950 * returns "Supplementary character \ud800\udc00"<br> 1951 * </p> 1952 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1953 * characters to its fullest. 1954 * 1955 * @param source UTF16 format Unicode string which the codepoint replacements will be based on. 1956 * @param oldChar32 Non-zero old codepoint to be replaced. 1957 * @param newChar32 The new codepoint to replace oldChar32 1958 * @return new String derived from source by replacing every occurrence of oldChar32 with 1959 * newChar32, unless when no oldChar32 is found in source then source will be returned. 
1960 * @stable ICU 2.6 1961 */ replace(String source, int oldChar32, int newChar32)1962 public static String replace(String source, int oldChar32, int newChar32) { 1963 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) { 1964 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint"); 1965 } 1966 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) { 1967 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint"); 1968 } 1969 1970 int index = indexOf(source, oldChar32); 1971 if (index == -1) { 1972 return source; 1973 } 1974 String newChar32Str = toString(newChar32); 1975 int oldChar32Size = 1; 1976 int newChar32Size = newChar32Str.length(); 1977 StringBuffer result = new StringBuffer(source); 1978 int resultIndex = index; 1979 1980 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) { 1981 oldChar32Size = 2; 1982 } 1983 1984 while (index != -1) { 1985 int endResultIndex = resultIndex + oldChar32Size; 1986 result.replace(resultIndex, endResultIndex, newChar32Str); 1987 int lastEndIndex = index + oldChar32Size; 1988 index = indexOf(source, oldChar32, lastEndIndex); 1989 resultIndex += newChar32Size + index - lastEndIndex; 1990 } 1991 return result.toString(); 1992 } 1993 1994 /** 1995 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr 1996 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string 1997 * source, then source will be returned. Otherwise, a new String object is created that 1998 * represents a codepoint sequence identical to the codepoint sequence represented by source, 1999 * except that every occurrence of oldStr is replaced by an occurrence of newStr. 
2000 * <p> 2001 * Examples: <br> 2002 * UTF16.replace("mesquite in your cellar", "e", "o");<br> 2003 * returns "mosquito in your collar"<br> 2004 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br> 2005 * returns "cat in your cellar"<br> 2006 * UTF16.replace("JonL", "q", "x");<br> 2007 * returns "JonL" (no change)<br> 2008 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br> 2009 * returns "Supplementary character !"<br> 2010 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br> 2011 * returns "Supplementary character \ud800\udc00"<br> 2012 * </p> 2013 * Note this method is provided as support to jdk 1.3, which does not support supplementary 2014 * characters to its fullest. 2015 * 2016 * @param source UTF16 format Unicode string which the replacements will be based on. 2017 * @param oldStr Non-zero-length string to be replaced. 2018 * @param newStr The new string to replace oldStr 2019 * @return new String derived from source by replacing every occurrence of oldStr with newStr. 2020 * When no oldStr is found in source, then source will be returned. 2021 * @stable ICU 2.6 2022 */ replace(String source, String oldStr, String newStr)2023 public static String replace(String source, String oldStr, String newStr) { 2024 int index = indexOf(source, oldStr); 2025 if (index == -1) { 2026 return source; 2027 } 2028 int oldStrSize = oldStr.length(); 2029 int newStrSize = newStr.length(); 2030 StringBuffer result = new StringBuffer(source); 2031 int resultIndex = index; 2032 2033 while (index != -1) { 2034 int endResultIndex = resultIndex + oldStrSize; 2035 result.replace(resultIndex, endResultIndex, newStr); 2036 int lastEndIndex = index + oldStrSize; 2037 index = indexOf(source, oldStr, lastEndIndex); 2038 resultIndex += newStrSize + index - lastEndIndex; 2039 } 2040 return result.toString(); 2041 } 2042 2043 /** 2044 * Reverses a UTF16 format Unicode string and replaces source's content with it. 
This method 2045 * will reverse surrogate characters correctly, instead of blindly reversing every character. 2046 * <p> 2047 * Examples:<br> 2048 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br> 2049 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS". 2050 * 2051 * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed 2052 * @return a modified source with reversed UTF16 format Unicode string. 2053 * @stable ICU 2.6 2054 */ reverse(StringBuffer source)2055 public static StringBuffer reverse(StringBuffer source) { 2056 int length = source.length(); 2057 StringBuffer result = new StringBuffer(length); 2058 for (int i = length; i-- > 0;) { 2059 char ch = source.charAt(i); 2060 if (isTrailSurrogate(ch) && i > 0) { 2061 char ch2 = source.charAt(i - 1); 2062 if (isLeadSurrogate(ch2)) { 2063 result.append(ch2); 2064 result.append(ch); 2065 --i; 2066 continue; 2067 } 2068 } 2069 result.append(ch); 2070 } 2071 return result; 2072 } 2073 2074 /** 2075 * Check if the string contains more Unicode code points than a certain number. This is more 2076 * efficient than counting all code points in the entire string and comparing that number with a 2077 * threshold. This function may not need to scan the string at all if the length is within a 2078 * certain range, and never needs to count more than 'number + 1' code points. Logically 2079 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two 2080 * code units. 2081 * 2082 * @param source The input string. 2083 * @param number The number of code points in the string is compared against the 'number' 2084 * parameter. 2085 * @return boolean value for whether the string contains more Unicode code points than 'number'. 
2086 * @stable ICU 2.4 2087 */ hasMoreCodePointsThan(String source, int number)2088 public static boolean hasMoreCodePointsThan(String source, int number) { 2089 if (number < 0) { 2090 return true; 2091 } 2092 if (source == null) { 2093 return false; 2094 } 2095 int length = source.length(); 2096 2097 // length >= 0 known 2098 // source contains at least (length + 1) / 2 code points: <= 2 2099 // chars per cp 2100 if (((length + 1) >> 1) > number) { 2101 return true; 2102 } 2103 2104 // check if source does not even contain enough chars 2105 int maxsupplementary = length - number; 2106 if (maxsupplementary <= 0) { 2107 return false; 2108 } 2109 2110 // there are maxsupplementary = length - number more chars than 2111 // asked-for code points 2112 2113 // count code points until they exceed and also check that there are 2114 // no more than maxsupplementary supplementary code points (char pairs) 2115 int start = 0; 2116 while (true) { 2117 if (length == 0) { 2118 return false; 2119 } 2120 if (number == 0) { 2121 return true; 2122 } 2123 if (isLeadSurrogate(source.charAt(start++)) && start != length 2124 && isTrailSurrogate(source.charAt(start))) { 2125 start++; 2126 if (--maxsupplementary <= 0) { 2127 // too many pairs - too few code points 2128 return false; 2129 } 2130 } 2131 --number; 2132 } 2133 } 2134 2135 /** 2136 * Check if the sub-range of char array, from argument start to limit, contains more Unicode 2137 * code points than a certain number. This is more efficient than counting all code points in 2138 * the entire char array range and comparing that number with a threshold. This function may not 2139 * need to scan the char array at all if start and limit is within a certain range, and never 2140 * needs to count more than 'number + 1' code points. Logically equivalent to 2141 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one 2142 * or two code units. 
2143 * 2144 * @param source Array of UTF-16 chars 2145 * @param start Offset to substring in the source array for analyzing 2146 * @param limit Offset to substring in the source array for analyzing 2147 * @param number The number of code points in the string is compared against the 'number' 2148 * parameter. 2149 * @return boolean value for whether the string contains more Unicode code points than 'number'. 2150 * @exception IndexOutOfBoundsException Thrown when limit < start 2151 * @stable ICU 2.4 2152 */ hasMoreCodePointsThan(char source[], int start, int limit, int number)2153 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) { 2154 int length = limit - start; 2155 if (length < 0 || start < 0 || limit < 0) { 2156 throw new IndexOutOfBoundsException( 2157 "Start and limit indexes should be non-negative and start <= limit"); 2158 } 2159 if (number < 0) { 2160 return true; 2161 } 2162 if (source == null) { 2163 return false; 2164 } 2165 2166 // length >= 0 known 2167 // source contains at least (length + 1) / 2 code points: <= 2 2168 // chars per cp 2169 if (((length + 1) >> 1) > number) { 2170 return true; 2171 } 2172 2173 // check if source does not even contain enough chars 2174 int maxsupplementary = length - number; 2175 if (maxsupplementary <= 0) { 2176 return false; 2177 } 2178 2179 // there are maxsupplementary = length - number more chars than 2180 // asked-for code points 2181 2182 // count code points until they exceed and also check that there are 2183 // no more than maxsupplementary supplementary code points (char pairs) 2184 while (true) { 2185 if (length == 0) { 2186 return false; 2187 } 2188 if (number == 0) { 2189 return true; 2190 } 2191 if (isLeadSurrogate(source[start++]) && start != limit 2192 && isTrailSurrogate(source[start])) { 2193 start++; 2194 if (--maxsupplementary <= 0) { 2195 // too many pairs - too few code points 2196 return false; 2197 } 2198 } 2199 --number; 2200 } 2201 } 2202 2203 /** 
2204 * Check if the string buffer contains more Unicode code points than a certain number. This is 2205 * more efficient than counting all code points in the entire string buffer and comparing that 2206 * number with a threshold. This function may not need to scan the string buffer at all if the 2207 * length is within a certain range, and never needs to count more than 'number + 1' code 2208 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may 2209 * occupy either one or two code units. 2210 * 2211 * @param source The input string buffer. 2212 * @param number The number of code points in the string buffer is compared against the 'number' 2213 * parameter. 2214 * @return boolean value for whether the string buffer contains more Unicode code points than 2215 * 'number'. 2216 * @stable ICU 2.4 2217 */ hasMoreCodePointsThan(StringBuffer source, int number)2218 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) { 2219 if (number < 0) { 2220 return true; 2221 } 2222 if (source == null) { 2223 return false; 2224 } 2225 int length = source.length(); 2226 2227 // length >= 0 known 2228 // source contains at least (length + 1) / 2 code points: <= 2 2229 // chars per cp 2230 if (((length + 1) >> 1) > number) { 2231 return true; 2232 } 2233 2234 // check if source does not even contain enough chars 2235 int maxsupplementary = length - number; 2236 if (maxsupplementary <= 0) { 2237 return false; 2238 } 2239 2240 // there are maxsupplementary = length - number more chars than 2241 // asked-for code points 2242 2243 // count code points until they exceed and also check that there are 2244 // no more than maxsupplementary supplementary code points (char pairs) 2245 int start = 0; 2246 while (true) { 2247 if (length == 0) { 2248 return false; 2249 } 2250 if (number == 0) { 2251 return true; 2252 } 2253 if (isLeadSurrogate(source.charAt(start++)) && start != length 2254 && isTrailSurrogate(source.charAt(start))) { 2255 
start++; 2256 if (--maxsupplementary <= 0) { 2257 // too many pairs - too few code points 2258 return false; 2259 } 2260 } 2261 --number; 2262 } 2263 } 2264 2265 /** 2266 * Cover JDK 1.5 API. Create a String from an array of codePoints. 2267 * 2268 * @param codePoints The code array 2269 * @param offset The start of the text in the code point array 2270 * @param count The number of code points 2271 * @return a String representing the code points between offset and count 2272 * @throws IllegalArgumentException If an invalid code point is encountered 2273 * @throws IndexOutOfBoundsException If the offset or count are out of bounds. 2274 * @stable ICU 3.0 2275 */ newString(int[] codePoints, int offset, int count)2276 public static String newString(int[] codePoints, int offset, int count) { 2277 if (count < 0) { 2278 throw new IllegalArgumentException(); 2279 } 2280 char[] chars = new char[count]; 2281 int w = 0; 2282 for (int r = offset, e = offset + count; r < e; ++r) { 2283 int cp = codePoints[r]; 2284 if (cp < 0 || cp > 0x10ffff) { 2285 throw new IllegalArgumentException(); 2286 } 2287 while (true) { 2288 try { 2289 if (cp < 0x010000) { 2290 chars[w] = (char) cp; 2291 w++; 2292 } else { 2293 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); 2294 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); 2295 w += 2; 2296 } 2297 break; 2298 } catch (IndexOutOfBoundsException ex) { 2299 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2) 2300 / (r - offset + 1))); 2301 char[] temp = new char[newlen]; 2302 System.arraycopy(chars, 0, temp, 0, w); 2303 chars = temp; 2304 } 2305 } 2306 } 2307 return new String(chars, 0, w); 2308 } 2309 2310 /** 2311 * <p> 2312 * UTF16 string comparator class. 
Allows UTF16 string comparison to be done with the various 2313 * modes 2314 * </p> 2315 * <ul> 2316 * <li> Code point comparison or code unit comparison 2317 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison 2318 * with special handling for character 'i'. 2319 * </ul> 2320 * <p> 2321 * The code unit or code point comparison differ only when comparing supplementary code points 2322 * (\u10000..\u10ffff) to BMP code points near the end of the BMP (i.e., 2323 * \ue000..\uffff). In code unit comparison, high BMP code points sort after 2324 * supplementary code points because they are stored as pairs of surrogates which are at 2325 * \ud800..\udfff. 2326 * </p> 2327 * 2328 * @see #FOLD_CASE_DEFAULT 2329 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2330 * @stable ICU 2.1 2331 */ 2332 public static final class StringComparator implements java.util.Comparator<String> { 2333 // public constructor ------------------------------------------------ 2334 2335 /** 2336 * Default constructor that does code unit comparison and case sensitive comparison. 2337 * 2338 * @stable ICU 2.1 2339 */ StringComparator()2340 public StringComparator() { 2341 this(false, false, FOLD_CASE_DEFAULT); 2342 } 2343 2344 /** 2345 * Constructor that does comparison based on the argument options. 2346 * 2347 * @param codepointcompare Flag to indicate true for code point comparison or false for code unit 2348 * comparison. 2349 * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison 2350 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2351 * when ignorecase is set to true. If ignorecase is false, this option is 2352 * ignored. 
2353 * @see #FOLD_CASE_DEFAULT 2354 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2355 * @throws IllegalArgumentException If foldcaseoption is out of range 2356 * @stable ICU 2.4 2357 */ StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption)2358 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) { 2359 setCodePointCompare(codepointcompare); 2360 m_ignoreCase_ = ignorecase; 2361 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2362 throw new IllegalArgumentException("Invalid fold case option"); 2363 } 2364 m_foldCase_ = foldcaseoption; 2365 } 2366 2367 // public data member ------------------------------------------------ 2368 2369 /** 2370 * Option value for case folding comparison: 2371 * 2372 * <p>Comparison is case insensitive, strings are folded using default mappings defined in 2373 * Unicode data file CaseFolding.txt, before comparison. 2374 * 2375 * @stable ICU 2.4 2376 */ 2377 public static final int FOLD_CASE_DEFAULT = 0; 2378 2379 /** 2380 * Option value for case folding: 2381 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I 2382 * and dotless i appropriately for Turkic languages (tr, az). 2383 * 2384 * <p>Comparison is case insensitive, strings are folded using modified mappings defined in 2385 * Unicode data file CaseFolding.txt, before comparison. 2386 * 2387 * @stable ICU 2.4 2388 * @see com.ibm.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I 2389 */ 2390 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1; 2391 2392 // public methods ---------------------------------------------------- 2393 2394 // public setters ---------------------------------------------------- 2395 2396 /** 2397 * Sets the comparison mode to code point compare if flag is true. 
Otherwise comparison mode 2398 * is set to code unit compare 2399 * 2400 * @param flag True for code point compare, false for code unit compare 2401 * @stable ICU 2.4 2402 */ setCodePointCompare(boolean flag)2403 public void setCodePointCompare(boolean flag) { 2404 if (flag) { 2405 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER; 2406 } else { 2407 m_codePointCompare_ = 0; 2408 } 2409 } 2410 2411 /** 2412 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise 2413 * case sensitive comparison mode if set to false. 2414 * 2415 * @param ignorecase True for case-insitive comparison, false for case sensitive comparison 2416 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2417 * when ignorecase is set to true. If ignorecase is false, this option is 2418 * ignored. 2419 * @see #FOLD_CASE_DEFAULT 2420 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2421 * @stable ICU 2.4 2422 */ setIgnoreCase(boolean ignorecase, int foldcaseoption)2423 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) { 2424 m_ignoreCase_ = ignorecase; 2425 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2426 throw new IllegalArgumentException("Invalid fold case option"); 2427 } 2428 m_foldCase_ = foldcaseoption; 2429 } 2430 2431 // public getters ---------------------------------------------------- 2432 2433 /** 2434 * Checks if the comparison mode is code point compare. 2435 * 2436 * @return true for code point compare, false for code unit compare 2437 * @stable ICU 2.4 2438 */ getCodePointCompare()2439 public boolean getCodePointCompare() { 2440 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2441 } 2442 2443 /** 2444 * Checks if Comparator is in the case insensitive mode. 
2445 * 2446 * @return true if Comparator performs case insensitive comparison, false otherwise 2447 * @stable ICU 2.4 2448 */ getIgnoreCase()2449 public boolean getIgnoreCase() { 2450 return m_ignoreCase_; 2451 } 2452 2453 /** 2454 * Gets the fold case options set in Comparator to be used with case insensitive comparison. 2455 * 2456 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I 2457 * @see #FOLD_CASE_DEFAULT 2458 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2459 * @stable ICU 2.4 2460 */ getIgnoreCaseOption()2461 public int getIgnoreCaseOption() { 2462 return m_foldCase_; 2463 } 2464 2465 // public other methods ---------------------------------------------- 2466 2467 /** 2468 * Compare two strings depending on the options selected during construction. 2469 * 2470 * @param a first source string. 2471 * @param b second source string. 2472 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b, 2473 * a positive value is returned. 2474 * @exception ClassCastException thrown when either a or b is not a String object 2475 * @stable ICU 4.4 2476 */ compare(String a, String b)2477 public int compare(String a, String b) { 2478 if (a == b) { 2479 return 0; 2480 } 2481 if (a == null) { 2482 return -1; 2483 } 2484 if (b == null) { 2485 return 1; 2486 } 2487 2488 if (m_ignoreCase_) { 2489 return compareCaseInsensitive(a, b); 2490 } 2491 return compareCaseSensitive(a, b); 2492 } 2493 2494 // private data member ---------------------------------------------- 2495 2496 /** 2497 * Code unit comparison flag. True if code unit comparison is required. False if code point 2498 * comparison is required. 2499 */ 2500 private int m_codePointCompare_; 2501 2502 /** 2503 * Fold case comparison option. 
2504 */ 2505 private int m_foldCase_; 2506 2507 /** 2508 * Flag indicator if ignore case is to be used during comparison 2509 */ 2510 private boolean m_ignoreCase_; 2511 2512 /** 2513 * Code point order offset for surrogate characters 2514 */ 2515 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800; 2516 2517 // private method --------------------------------------------------- 2518 2519 /** 2520 * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life 2521 * easier. 2522 * 2523 * @param s1 2524 * first string to compare 2525 * @param s2 2526 * second string to compare 2527 * @return -1 is s1 < s2, 0 if equals, 2528 */ compareCaseInsensitive(String s1, String s2)2529 private int compareCaseInsensitive(String s1, String s2) { 2530 return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_ 2531 | Normalizer.COMPARE_IGNORE_CASE); 2532 } 2533 2534 /** 2535 * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life 2536 * easier. 
2537 * 2538 * @param s1 2539 * first string to compare 2540 * @param s2 2541 * second string to compare 2542 * @return -1 is s1 < s2, 0 if equals, 2543 */ compareCaseSensitive(String s1, String s2)2544 private int compareCaseSensitive(String s1, String s2) { 2545 // compare identical prefixes - they do not need to be fixed up 2546 // limit1 = start1 + min(lenght1, length2) 2547 int length1 = s1.length(); 2548 int length2 = s2.length(); 2549 int minlength = length1; 2550 int result = 0; 2551 if (length1 < length2) { 2552 result = -1; 2553 } else if (length1 > length2) { 2554 result = 1; 2555 minlength = length2; 2556 } 2557 2558 char c1 = 0; 2559 char c2 = 0; 2560 int index = 0; 2561 for (; index < minlength; index++) { 2562 c1 = s1.charAt(index); 2563 c2 = s2.charAt(index); 2564 // check pseudo-limit 2565 if (c1 != c2) { 2566 break; 2567 } 2568 } 2569 2570 if (index == minlength) { 2571 return result; 2572 } 2573 2574 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2575 // if both values are in or above the surrogate range, fix them up 2576 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE 2577 && codepointcompare) { 2578 // subtract 0x2800 from BMP code points to make them smaller 2579 // than supplementary ones 2580 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1))) 2581 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) { 2582 // part of a surrogate pair, leave >=d800 2583 } else { 2584 // BMP code point - may be surrogate code point - make 2585 // < d800 2586 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2587 } 2588 2589 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1))) 2590 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) { 2591 // part of a surrogate pair, leave >=d800 2592 } else { 2593 // BMP code point - may be surrogate code point 
- make <d800 2594 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2595 } 2596 } 2597 2598 // now c1 and c2 are in UTF-32-compatible order 2599 return c1 - c2; 2600 } 2601 } 2602 2603 /** 2604 * Utility for getting a code point from a CharSequence that contains exactly one code point. 2605 * @return the code point IF the string is non-null and consists of a single code point. 2606 * otherwise returns -1. 2607 * @param s to test 2608 * @stable ICU 54 2609 */ getSingleCodePoint(CharSequence s)2610 public static int getSingleCodePoint(CharSequence s) { 2611 if (s == null || s.length() == 0) { 2612 return -1; 2613 } else if (s.length() == 1) { 2614 return s.charAt(0); 2615 } else if (s.length() > 2) { 2616 return -1; 2617 } 2618 2619 // at this point, len = 2 2620 int cp = Character.codePointAt(s, 0); 2621 if (cp > 0xFFFF) { // is surrogate pair 2622 return cp; 2623 } 2624 return -1; 2625 } 2626 2627 /** 2628 * Utility for comparing a code point to a string without having to create a new string. Returns the same results 2629 * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if 2630 * <pre> 2631 * sc = new StringComparator(true,false,0); 2632 * fast = UTF16.compareCodePoint(codePoint, charSequence) 2633 * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString()) 2634 * </pre> 2635 * then 2636 * <pre> 2637 * Integer.signum(fast) == Integer.signum(slower) 2638 * </pre> 2639 * @param codePoint to test 2640 * @param s to test 2641 * @return equivalent of code point comparator comparing two strings. 
2642 * @stable ICU 54 2643 */ compareCodePoint(int codePoint, CharSequence s)2644 public static int compareCodePoint(int codePoint, CharSequence s) { 2645 if (s == null) { 2646 return 1; 2647 } 2648 final int strLen = s.length(); 2649 if (strLen == 0) { 2650 return 1; 2651 } 2652 int second = Character.codePointAt(s, 0); 2653 int diff = codePoint - second; 2654 if (diff != 0) { 2655 return diff; 2656 } 2657 return strLen == Character.charCount(codePoint) ? 0 : -1; 2658 } 2659 2660 // private data members ------------------------------------------------- 2661 2662 /** 2663 * Shift value for lead surrogate to form a supplementary character. 2664 */ 2665 private static final int LEAD_SURROGATE_SHIFT_ = 10; 2666 2667 /** 2668 * Mask to retrieve the significant value from a trail surrogate. 2669 */ 2670 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 2671 2672 /** 2673 * Value that all lead surrogate starts with 2674 */ 2675 private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE 2676 - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); 2677 2678 // private methods ------------------------------------------------------ 2679 2680 /** 2681 * <p> 2682 * Converts argument code point and returns a String object representing the code point's value 2683 * in UTF16 format. 2684 * </p> 2685 * <p> 2686 * This method does not check for the validity of the codepoint, the results are not guaranteed 2687 * if a invalid codepoint is passed as argument. 2688 * </p> 2689 * <p> 2690 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise. 
2691 * </p> 2692 * 2693 * @param ch 2694 * code point 2695 * @return string representation of the code point 2696 */ toString(int ch)2697 private static String toString(int ch) { 2698 if (ch < SUPPLEMENTARY_MIN_VALUE) { 2699 return String.valueOf((char) ch); 2700 } 2701 2702 StringBuilder result = new StringBuilder(); 2703 result.append(getLeadSurrogate(ch)); 2704 result.append(getTrailSurrogate(ch)); 2705 return result.toString(); 2706 } 2707 } 2708 // eof 2709