1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.text; 11 12 import com.ibm.icu.impl.Utility; 13 14 /** 15 * <p> 16 * Standalone utility class providing UTF16 character conversions and indexing conversions. 17 * </p> 18 * <p> 19 * Code that uses strings alone rarely need modification. By design, UTF-16 does not allow overlap, 20 * so searching for strings is a safe operation. Similarly, concatenation is always safe. 21 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the 22 * values for start and end are on those boundaries, since they arose from operations like 23 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>. 24 * </p> 25 * <strong>Examples:</strong> 26 * <p> 27 * The following examples illustrate use of some of these methods. 
*
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i &lt; s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i &gt;= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * int ch;
 * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 *
 * <strong>Notes:</strong>
 * <ul>
 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
 * offset to a UTF-32 offset and back if and only if
 * <code>bounds(string, offset16) != TRAIL_SURROGATE_BOUNDARY</code>.
 * </li>
 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
 * check for validity if desired.
</li> 72 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then 73 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It 74 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4, 75 * 5.5). </li> 76 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the 77 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceeding small 78 * percentage of all the text in the world, the singleton case should always be optimized for. </li> 79 * </ul> 80 * 81 * @author Mark Davis, with help from Markus Scherer 82 * @stable ICU 2.1 83 */ 84 85 public final class UTF16 { 86 // public variables --------------------------------------------------- 87 88 /** 89 * Value returned in {@link #bounds(String, int) bounds()}. 90 * These values are chosen specifically so that it actually represents the position of the 91 * character [offset16 - (value >> 2), offset16 + (value & 3)] 92 * 93 * @stable ICU 2.1 94 */ 95 public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2, 96 TRAIL_SURROGATE_BOUNDARY = 5; 97 98 /** 99 * The lowest Unicode code point value. 100 * 101 * @stable ICU 2.1 102 */ 103 public static final int CODEPOINT_MIN_VALUE = 0; 104 105 /** 106 * The highest Unicode code point value (scalar value) according to the Unicode Standard. 
107 * 108 * @stable ICU 2.1 109 */ 110 public static final int CODEPOINT_MAX_VALUE = 0x10ffff; 111 112 /** 113 * The minimum value for Supplementary code points 114 * 115 * @stable ICU 2.1 116 */ 117 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 118 119 /** 120 * Lead surrogate minimum value 121 * 122 * @stable ICU 2.1 123 */ 124 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 125 126 /** 127 * Trail surrogate minimum value 128 * 129 * @stable ICU 2.1 130 */ 131 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 132 133 /** 134 * Lead surrogate maximum value 135 * 136 * @stable ICU 2.1 137 */ 138 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 139 140 /** 141 * Trail surrogate maximum value 142 * 143 * @stable ICU 2.1 144 */ 145 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 146 147 /** 148 * Surrogate minimum value 149 * 150 * @stable ICU 2.1 151 */ 152 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; 153 154 /** 155 * Maximum surrogate value 156 * 157 * @stable ICU 2.1 158 */ 159 public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE; 160 161 /** 162 * Lead surrogate bitmask 163 */ 164 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; 165 166 /** 167 * Trail surrogate bitmask 168 */ 169 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; 170 171 /** 172 * Surrogate bitmask 173 */ 174 private static final int SURROGATE_BITMASK = 0xFFFFF800; 175 176 /** 177 * Lead surrogate bits 178 */ 179 private static final int LEAD_SURROGATE_BITS = 0xD800; 180 181 /** 182 * Trail surrogate bits 183 */ 184 private static final int TRAIL_SURROGATE_BITS = 0xDC00; 185 186 /** 187 * Surrogate bits 188 */ 189 private static final int SURROGATE_BITS = 0xD800; 190 191 // constructor -------------------------------------------------------- 192 193 // /CLOVER:OFF 194 /** 195 * Prevent instance from being created. 
196 */ UTF16()197 private UTF16() { 198 } 199 200 // /CLOVER:ON 201 // public method ------------------------------------------------------ 202 203 /** 204 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 205 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 206 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 207 * UCharacter.isLegal()</a></code> 208 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 209 * character will be returned. If a complete supplementary character is not found the incomplete 210 * character will be returned 211 * 212 * @param source Array of UTF-16 chars 213 * @param offset16 UTF-16 offset to the start of the character. 214 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 215 * of that codepoint are the same as in <code>bounds32()</code>. 216 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 217 * @stable ICU 2.1 218 */ charAt(String source, int offset16)219 public static int charAt(String source, int offset16) { 220 char single = source.charAt(offset16); 221 if (single < LEAD_SURROGATE_MIN_VALUE) { 222 return single; 223 } 224 return _charAt(source, offset16, single); 225 } 226 _charAt(String source, int offset16, char single)227 private static int _charAt(String source, int offset16, char single) { 228 if (single > TRAIL_SURROGATE_MAX_VALUE) { 229 return single; 230 } 231 232 // Convert the UTF-16 surrogate pair if necessary. 233 // For simplicity in usage, and because the frequency of pairs is 234 // low, look both directions. 
235 236 if (single <= LEAD_SURROGATE_MAX_VALUE) { 237 ++offset16; 238 if (source.length() != offset16) { 239 char trail = source.charAt(offset16); 240 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { 241 return Character.toCodePoint(single, trail); 242 } 243 } 244 } else { 245 --offset16; 246 if (offset16 >= 0) { 247 // single is a trail surrogate so 248 char lead = source.charAt(offset16); 249 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { 250 return Character.toCodePoint(lead, single); 251 } 252 } 253 } 254 return single; // return unmatched surrogate 255 } 256 257 /** 258 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 259 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 260 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 261 * UCharacter.isLegal()</a></code> 262 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 263 * character will be returned. If a complete supplementary character is not found the incomplete 264 * character will be returned 265 * 266 * @param source Array of UTF-16 chars 267 * @param offset16 UTF-16 offset to the start of the character. 268 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 269 * of that codepoint are the same as in <code>bounds32()</code>. 270 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 
271 * @stable ICU 2.1 272 */ charAt(CharSequence source, int offset16)273 public static int charAt(CharSequence source, int offset16) { 274 char single = source.charAt(offset16); 275 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { 276 return single; 277 } 278 return _charAt(source, offset16, single); 279 } 280 _charAt(CharSequence source, int offset16, char single)281 private static int _charAt(CharSequence source, int offset16, char single) { 282 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { 283 return single; 284 } 285 286 // Convert the UTF-16 surrogate pair if necessary. 287 // For simplicity in usage, and because the frequency of pairs is 288 // low, look both directions. 289 290 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 291 ++offset16; 292 if (source.length() != offset16) { 293 char trail = source.charAt(offset16); 294 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE 295 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { 296 return Character.toCodePoint(single, trail); 297 } 298 } 299 } else { 300 --offset16; 301 if (offset16 >= 0) { 302 // single is a trail surrogate so 303 char lead = source.charAt(offset16); 304 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE 305 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 306 return Character.toCodePoint(lead, single); 307 } 308 } 309 } 310 return single; // return unmatched surrogate 311 } 312 313 /** 314 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 315 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 316 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 317 * </a></code> 318 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 319 * character will be returned. 
If a complete supplementary character is not found the incomplete 320 * character will be returned 321 * 322 * @param source UTF-16 chars string buffer 323 * @param offset16 UTF-16 offset to the start of the character. 324 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 325 * of that codepoint are the same as in <code>bounds32()</code>. 326 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 327 * @stable ICU 2.1 328 */ charAt(StringBuffer source, int offset16)329 public static int charAt(StringBuffer source, int offset16) { 330 if (offset16 < 0 || offset16 >= source.length()) { 331 throw new StringIndexOutOfBoundsException(offset16); 332 } 333 334 char single = source.charAt(offset16); 335 if (!isSurrogate(single)) { 336 return single; 337 } 338 339 // Convert the UTF-16 surrogate pair if necessary. 340 // For simplicity in usage, and because the frequency of pairs is 341 // low, look both directions. 342 343 if (single <= LEAD_SURROGATE_MAX_VALUE) { 344 ++offset16; 345 if (source.length() != offset16) { 346 char trail = source.charAt(offset16); 347 if (isTrailSurrogate(trail)) 348 return Character.toCodePoint(single, trail); 349 } 350 } else { 351 --offset16; 352 if (offset16 >= 0) { 353 // single is a trail surrogate so 354 char lead = source.charAt(offset16); 355 if (isLeadSurrogate(lead)) { 356 return Character.toCodePoint(lead, single); 357 } 358 } 359 } 360 return single; // return unmatched surrogate 361 } 362 363 /** 364 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards 365 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 366 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 367 * </a></code> 368 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 369 * character will be returned. 
If a complete supplementary character is not found the incomplete 370 * character will be returned 371 * 372 * @param source Array of UTF-16 chars 373 * @param start Offset to substring in the source array for analyzing 374 * @param limit Offset to substring in the source array for analyzing 375 * @param offset16 UTF-16 offset relative to start 376 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 377 * of that codepoint are the same as in <code>bounds32()</code>. 378 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. 379 * @stable ICU 2.1 380 */ charAt(char source[], int start, int limit, int offset16)381 public static int charAt(char source[], int start, int limit, int offset16) { 382 offset16 += start; 383 if (offset16 < start || offset16 >= limit) { 384 throw new ArrayIndexOutOfBoundsException(offset16); 385 } 386 387 char single = source[offset16]; 388 if (!isSurrogate(single)) { 389 return single; 390 } 391 392 // Convert the UTF-16 surrogate pair if necessary. 393 // For simplicity in usage, and because the frequency of pairs is 394 // low, look both directions. 395 if (single <= LEAD_SURROGATE_MAX_VALUE) { 396 offset16++; 397 if (offset16 >= limit) { 398 return single; 399 } 400 char trail = source[offset16]; 401 if (isTrailSurrogate(trail)) { 402 return Character.toCodePoint(single, trail); 403 } 404 } else { // isTrailSurrogate(single), so 405 if (offset16 == start) { 406 return single; 407 } 408 offset16--; 409 char lead = source[offset16]; 410 if (isLeadSurrogate(lead)) 411 return Character.toCodePoint(lead, single); 412 } 413 return single; // return unmatched surrogate 414 } 415 416 /** 417 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 418 * <code>UTF16.getCharCount()</code>, as well as random access. 
If a validity check is 419 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 420 * </a></code> 421 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 422 * character will be returned. If a complete supplementary character is not found the incomplete 423 * character will be returned 424 * 425 * @param source UTF-16 chars string buffer 426 * @param offset16 UTF-16 offset to the start of the character. 427 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 428 * of that codepoint are the same as in <code>bounds32()</code>. 429 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 430 * @stable ICU 2.1 431 */ charAt(Replaceable source, int offset16)432 public static int charAt(Replaceable source, int offset16) { 433 if (offset16 < 0 || offset16 >= source.length()) { 434 throw new StringIndexOutOfBoundsException(offset16); 435 } 436 437 char single = source.charAt(offset16); 438 if (!isSurrogate(single)) { 439 return single; 440 } 441 442 // Convert the UTF-16 surrogate pair if necessary. 443 // For simplicity in usage, and because the frequency of pairs is 444 // low, look both directions. 445 446 if (single <= LEAD_SURROGATE_MAX_VALUE) { 447 ++offset16; 448 if (source.length() != offset16) { 449 char trail = source.charAt(offset16); 450 if (isTrailSurrogate(trail)) 451 return Character.toCodePoint(single, trail); 452 } 453 } else { 454 --offset16; 455 if (offset16 >= 0) { 456 // single is a trail surrogate so 457 char lead = source.charAt(offset16); 458 if (isLeadSurrogate(lead)) { 459 return Character.toCodePoint(lead, single); 460 } 461 } 462 } 463 return single; // return unmatched surrogate 464 } 465 466 /** 467 * Determines how many chars this char32 requires. If a validity check is required, use <code> 468 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 469 * on char32 before calling. 
470 * 471 * @param char32 The input codepoint. 472 * @return 2 if is in supplementary space, otherwise 1. 473 * @stable ICU 2.1 474 */ getCharCount(int char32)475 public static int getCharCount(int char32) { 476 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 477 return 1; 478 } 479 return 2; 480 } 481 482 /** 483 * Returns the type of the boundaries around the char at offset16. Used for random access. 484 * 485 * @param source Text to analyse 486 * @param offset16 UTF-16 offset 487 * @return 488 * <ul> 489 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1] 490 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 491 * are [offset16, offset16 + 2] 492 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 493 * bounds are [offset16 - 1, offset16 + 1] 494 * </ul> 495 * For bit-twiddlers, the return values for these are chosen so that the boundaries 496 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 497 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 498 * @stable ICU 2.1 499 */ bounds(String source, int offset16)500 public static int bounds(String source, int offset16) { 501 char ch = source.charAt(offset16); 502 if (isSurrogate(ch)) { 503 if (isLeadSurrogate(ch)) { 504 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 505 return LEAD_SURROGATE_BOUNDARY; 506 } 507 } else { 508 // isTrailSurrogate(ch), so 509 --offset16; 510 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 511 return TRAIL_SURROGATE_BOUNDARY; 512 } 513 } 514 } 515 return SINGLE_CHAR_BOUNDARY; 516 } 517 518 /** 519 * Returns the type of the boundaries around the char at offset16. Used for random access. 
520 * 521 * @param source String buffer to analyse 522 * @param offset16 UTF16 offset 523 * @return 524 * <ul> 525 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1] 526 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 527 * are [offset16, offset16 + 2] 528 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 529 * bounds are [offset16 - 1, offset16 + 1] 530 * </ul> 531 * For bit-twiddlers, the return values for these are chosen so that the boundaries 532 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 533 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 534 * @stable ICU 2.1 535 */ bounds(StringBuffer source, int offset16)536 public static int bounds(StringBuffer source, int offset16) { 537 char ch = source.charAt(offset16); 538 if (isSurrogate(ch)) { 539 if (isLeadSurrogate(ch)) { 540 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 541 return LEAD_SURROGATE_BOUNDARY; 542 } 543 } else { 544 // isTrailSurrogate(ch), so 545 --offset16; 546 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 547 return TRAIL_SURROGATE_BOUNDARY; 548 } 549 } 550 } 551 return SINGLE_CHAR_BOUNDARY; 552 } 553 554 /** 555 * Returns the type of the boundaries around the char at offset16. Used for random access. Note 556 * that the boundaries are determined with respect to the subarray, hence the char array 557 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1. 
558 * 559 * @param source Char array to analyse 560 * @param start Offset to substring in the source array for analyzing 561 * @param limit Offset to substring in the source array for analyzing 562 * @param offset16 UTF16 offset relative to start 563 * @return 564 * <ul> 565 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are 566 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 567 * are [offset16, offset16 + 2] 568 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 569 * bounds are [offset16 - 1, offset16 + 1] 570 * </ul> 571 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries 572 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)]. 573 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 574 * @stable ICU 2.1 575 */ bounds(char source[], int start, int limit, int offset16)576 public static int bounds(char source[], int start, int limit, int offset16) { 577 offset16 += start; 578 if (offset16 < start || offset16 >= limit) { 579 throw new ArrayIndexOutOfBoundsException(offset16); 580 } 581 char ch = source[offset16]; 582 if (isSurrogate(ch)) { 583 if (isLeadSurrogate(ch)) { 584 ++offset16; 585 if (offset16 < limit && isTrailSurrogate(source[offset16])) { 586 return LEAD_SURROGATE_BOUNDARY; 587 } 588 } else { // isTrailSurrogate(ch), so 589 --offset16; 590 if (offset16 >= start && isLeadSurrogate(source[offset16])) { 591 return TRAIL_SURROGATE_BOUNDARY; 592 } 593 } 594 } 595 return SINGLE_CHAR_BOUNDARY; 596 } 597 598 /** 599 * Determines whether the code point is a surrogate. 600 * 601 * @param codePoint The input character. 602 * (In ICU 2.1-69 the type of this parameter was <code>char</code>.) 603 * @return true If the input code point is a surrogate. 
604 * @stable ICU 70 605 */ isSurrogate(int codePoint)606 public static boolean isSurrogate(int codePoint) { 607 return (codePoint & SURROGATE_BITMASK) == SURROGATE_BITS; 608 } 609 610 /** 611 * Determines whether the code point is a trail surrogate. 612 * 613 * @param codePoint The input character. 614 * (In ICU 2.1-69 the type of this parameter was <code>char</code>.) 615 * @return true If the input code point is a trail surrogate. 616 * @stable ICU 70 617 */ isTrailSurrogate(int codePoint)618 public static boolean isTrailSurrogate(int codePoint) { 619 return (codePoint & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; 620 } 621 622 /** 623 * Determines whether the code point is a lead surrogate. 624 * 625 * @param codePoint The input character. 626 * (In ICU 2.1-69 the type of this parameter was <code>char</code>.) 627 * @return true If the input code point is a lead surrogate 628 * @stable ICU 70 629 */ isLeadSurrogate(int codePoint)630 public static boolean isLeadSurrogate(int codePoint) { 631 return (codePoint & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; 632 } 633 634 /** 635 * Returns the lead surrogate. If a validity check is required, use 636 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 637 * before calling. 638 * 639 * @param char32 The input character. 640 * @return lead surrogate if the getCharCount(ch) is 2; <br> 641 * and 0 otherwise (note: 0 is not a valid lead surrogate). 642 * @stable ICU 2.1 643 */ getLeadSurrogate(int char32)644 public static char getLeadSurrogate(int char32) { 645 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 646 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_)); 647 } 648 return 0; 649 } 650 651 /** 652 * Returns the trail surrogate. If a validity check is required, use 653 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 654 * before calling. 655 * 656 * @param char32 The input character. 
657 * @return the trail surrogate if the getCharCount(ch) is 2; <br> 658 * otherwise the character itself 659 * @stable ICU 2.1 660 */ getTrailSurrogate(int char32)661 public static char getTrailSurrogate(int char32) { 662 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 663 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_)); 664 } 665 return (char) char32; 666 } 667 668 /** 669 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string 670 * containing the UTF-32 value in UTF16 format. If a validity check is required, use 671 * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before calling. 672 * 673 * @param char32 The input character. 674 * @return string value of char32 in UTF16 format 675 * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint. 676 * @stable ICU 2.1 677 */ valueOf(int char32)678 public static String valueOf(int char32) { 679 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 680 throw new IllegalArgumentException("Illegal codepoint"); 681 } 682 return toString(char32); 683 } 684 685 /** 686 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or 687 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate 688 * character, the whole supplementary codepoint will be returned. If a validity check is 689 * required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the 690 * codepoint at offset16 before calling. The result returned will be a newly created String 691 * obtained by calling source.substring(..) with the appropriate indexes. 692 * 693 * @param source The input string. 
694 * @param offset16 The UTF16 index to the codepoint in source 695 * @return string value of char32 in UTF16 format 696 * @stable ICU 2.1 697 */ valueOf(String source, int offset16)698 public static String valueOf(String source, int offset16) { 699 switch (bounds(source, offset16)) { 700 case LEAD_SURROGATE_BOUNDARY: 701 return source.substring(offset16, offset16 + 2); 702 case TRAIL_SURROGATE_BOUNDARY: 703 return source.substring(offset16 - 1, offset16 + 1); 704 default: 705 return source.substring(offset16, offset16 + 1); 706 } 707 } 708 709 /** 710 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a 711 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a 712 * surrogate character, the whole supplementary codepoint will be returned. If a validity check 713 * is required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on 714 * the codepoint at offset16 before calling. The result returned will be a newly created String 715 * obtained by calling source.substring(..) with the appropriate indexes. 716 * 717 * @param source The input string buffer. 718 * @param offset16 The UTF16 index to the codepoint in source 719 * @return string value of char32 in UTF16 format 720 * @stable ICU 2.1 721 */ valueOf(StringBuffer source, int offset16)722 public static String valueOf(StringBuffer source, int offset16) { 723 switch (bounds(source, offset16)) { 724 case LEAD_SURROGATE_BOUNDARY: 725 return source.substring(offset16, offset16 + 2); 726 case TRAIL_SURROGATE_BOUNDARY: 727 return source.substring(offset16 - 1, offset16 + 1); 728 default: 729 return source.substring(offset16, offset16 + 1); 730 } 731 } 732 733 /** 734 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16 735 * format. 
If offset16 indexes a surrogate character, the whole supplementary codepoint will be 736 * returned, except when either the leading or trailing surrogate character lies out of the 737 * specified subarray. In the latter case, only the surrogate character within bounds will be 738 * returned. If a validity check is required, use 739 * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the codepoint at 740 * offset16 before calling. The result returned will be a newly created String containing the 741 * relevant characters. 742 * 743 * @param source The input char array. 744 * @param start Start index of the subarray 745 * @param limit End index of the subarray 746 * @param offset16 The UTF16 index to the codepoint in source relative to start 747 * @return string value of char32 in UTF16 format 748 * @stable ICU 2.1 749 */ valueOf(char source[], int start, int limit, int offset16)750 public static String valueOf(char source[], int start, int limit, int offset16) { 751 switch (bounds(source, start, limit, offset16)) { 752 case LEAD_SURROGATE_BOUNDARY: 753 return new String(source, start + offset16, 2); 754 case TRAIL_SURROGATE_BOUNDARY: 755 return new String(source, start + offset16 - 1, 2); 756 } 757 return new String(source, start + offset16, 1); 758 } 759 760 /** 761 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 762 * the {@link UTF16 class description} for notes on roundtripping. 763 * 764 * @param source The UTF-16 string 765 * @param offset32 UTF-32 offset 766 * @return UTF-16 offset 767 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 
768 * @stable ICU 2.1 769 */ findOffsetFromCodePoint(String source, int offset32)770 public static int findOffsetFromCodePoint(String source, int offset32) { 771 char ch; 772 int size = source.length(), result = 0, count = offset32; 773 if (offset32 < 0 || offset32 > size) { 774 throw new StringIndexOutOfBoundsException(offset32); 775 } 776 while (result < size && count > 0) { 777 ch = source.charAt(result); 778 if (isLeadSurrogate(ch) && ((result + 1) < size) 779 && isTrailSurrogate(source.charAt(result + 1))) { 780 result++; 781 } 782 783 count--; 784 result++; 785 } 786 if (count != 0) { 787 throw new StringIndexOutOfBoundsException(offset32); 788 } 789 return result; 790 } 791 792 /** 793 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 794 * the {@link UTF16 class description} for notes on roundtripping. 795 * 796 * @param source The UTF-16 string buffer 797 * @param offset32 UTF-32 offset 798 * @return UTF-16 offset 799 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 800 * @stable ICU 2.1 801 */ findOffsetFromCodePoint(StringBuffer source, int offset32)802 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) { 803 char ch; 804 int size = source.length(), result = 0, count = offset32; 805 if (offset32 < 0 || offset32 > size) { 806 throw new StringIndexOutOfBoundsException(offset32); 807 } 808 while (result < size && count > 0) { 809 ch = source.charAt(result); 810 if (isLeadSurrogate(ch) && ((result + 1) < size) 811 && isTrailSurrogate(source.charAt(result + 1))) { 812 result++; 813 } 814 815 count--; 816 result++; 817 } 818 if (count != 0) { 819 throw new StringIndexOutOfBoundsException(offset32); 820 } 821 return result; 822 } 823 824 /** 825 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 826 * the {@link UTF16 class description} for notes on roundtripping. 
827 * 828 * @param source The UTF-16 char array whose substring is to be analysed 829 * @param start Offset of the substring to be analysed 830 * @param limit Offset of the substring to be analysed 831 * @param offset32 UTF-32 offset relative to start 832 * @return UTF-16 offset relative to start 833 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 834 * @stable ICU 2.1 835 */ findOffsetFromCodePoint(char source[], int start, int limit, int offset32)836 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) { 837 char ch; 838 int result = start, count = offset32; 839 if (offset32 > limit - start) { 840 throw new ArrayIndexOutOfBoundsException(offset32); 841 } 842 while (result < limit && count > 0) { 843 ch = source[result]; 844 if (isLeadSurrogate(ch) && ((result + 1) < limit) 845 && isTrailSurrogate(source[result + 1])) { 846 result++; 847 } 848 849 count--; 850 result++; 851 } 852 if (count != 0) { 853 throw new ArrayIndexOutOfBoundsException(offset32); 854 } 855 return result - start; 856 } 857 858 /** 859 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given 860 * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for 861 * notes on roundtripping.<br> 862 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 863 * of the <strong>lead</strong> of the pair is returned. </i> 864 * <p> 865 * To find the UTF-32 length of a string, use: 866 * 867 * <pre> 868 * len32 = countCodePoint(source, source.length()); 869 * </pre> 870 * 871 * @param source Text to analyse 872 * @param offset16 UTF-16 offset < source text length. 873 * @return UTF-32 offset 874 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
875 * @stable ICU 2.1 876 */ findCodePointOffset(String source, int offset16)877 public static int findCodePointOffset(String source, int offset16) { 878 if (offset16 < 0 || offset16 > source.length()) { 879 throw new StringIndexOutOfBoundsException(offset16); 880 } 881 882 int result = 0; 883 char ch; 884 boolean hadLeadSurrogate = false; 885 886 for (int i = 0; i < offset16; ++i) { 887 ch = source.charAt(i); 888 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 889 hadLeadSurrogate = false; // count valid trail as zero 890 } else { 891 hadLeadSurrogate = isLeadSurrogate(ch); 892 ++result; // count others as 1 893 } 894 } 895 896 if (offset16 == source.length()) { 897 return result; 898 } 899 900 // end of source being the less significant surrogate character 901 // shift result back to the start of the supplementary character 902 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 903 result--; 904 } 905 906 return result; 907 } 908 909 /** 910 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 911 * offset. Used for random access. See the {@link UTF16 class description} for notes on 912 * roundtripping.<br> 913 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 914 * of the <strong>lead</strong> of the pair is returned. </i> 915 * <p> 916 * To find the UTF-32 length of a string, use: 917 * 918 * <pre> 919 * len32 = countCodePoint(source); 920 * </pre> 921 * 922 * @param source Text to analyse 923 * @param offset16 UTF-16 offset < source text length. 924 * @return UTF-32 offset 925 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
926 * @stable ICU 2.1 927 */ findCodePointOffset(StringBuffer source, int offset16)928 public static int findCodePointOffset(StringBuffer source, int offset16) { 929 if (offset16 < 0 || offset16 > source.length()) { 930 throw new StringIndexOutOfBoundsException(offset16); 931 } 932 933 int result = 0; 934 char ch; 935 boolean hadLeadSurrogate = false; 936 937 for (int i = 0; i < offset16; ++i) { 938 ch = source.charAt(i); 939 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 940 hadLeadSurrogate = false; // count valid trail as zero 941 } else { 942 hadLeadSurrogate = isLeadSurrogate(ch); 943 ++result; // count others as 1 944 } 945 } 946 947 if (offset16 == source.length()) { 948 return result; 949 } 950 951 // end of source being the less significant surrogate character 952 // shift result back to the start of the supplementary character 953 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 954 result--; 955 } 956 957 return result; 958 } 959 960 /** 961 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 962 * offset. Used for random access. See the {@link UTF16 class description} for notes on 963 * roundtripping.<br> 964 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 965 * of the <strong>lead</strong> of the pair is returned. </i> 966 * <p> 967 * To find the UTF-32 length of a substring, use: 968 * 969 * <pre> 970 * len32 = countCodePoint(source, start, limit); 971 * </pre> 972 * 973 * @param source Text to analyse 974 * @param start Offset of the substring 975 * @param limit Offset of the substring 976 * @param offset16 UTF-16 relative to start 977 * @return UTF-32 offset relative to start 978 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 
979 * @stable ICU 2.1 980 */ findCodePointOffset(char source[], int start, int limit, int offset16)981 public static int findCodePointOffset(char source[], int start, int limit, int offset16) { 982 offset16 += start; 983 if (offset16 > limit) { 984 throw new StringIndexOutOfBoundsException(offset16); 985 } 986 987 int result = 0; 988 char ch; 989 boolean hadLeadSurrogate = false; 990 991 for (int i = start; i < offset16; ++i) { 992 ch = source[i]; 993 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 994 hadLeadSurrogate = false; // count valid trail as zero 995 } else { 996 hadLeadSurrogate = isLeadSurrogate(ch); 997 ++result; // count others as 1 998 } 999 } 1000 1001 if (offset16 == limit) { 1002 return result; 1003 } 1004 1005 // end of source being the less significant surrogate character 1006 // shift result back to the start of the supplementary character 1007 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) { 1008 result--; 1009 } 1010 1011 return result; 1012 } 1013 1014 /** 1015 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required, 1016 * use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before 1017 * calling. 1018 * 1019 * @param target The buffer to append to 1020 * @param char32 Value to append. 
1021 * @return the updated StringBuffer 1022 * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints 1023 * @stable ICU 2.1 1024 */ append(StringBuffer target, int char32)1025 public static StringBuffer append(StringBuffer target, int char32) { 1026 // Check for irregular values 1027 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1028 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); 1029 } 1030 1031 // Write the UTF-16 values 1032 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1033 target.append(getLeadSurrogate(char32)); 1034 target.append(getTrailSurrogate(char32)); 1035 } else { 1036 target.append((char) char32); 1037 } 1038 return target; 1039 } 1040 1041 /** 1042 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a 1043 * convenience. 1044 * 1045 * @param target The buffer to append to 1046 * @param cp The code point to append 1047 * @return the updated StringBuffer 1048 * @throws IllegalArgumentException If cp is not a valid code point 1049 * @stable ICU 3.0 1050 */ appendCodePoint(StringBuffer target, int cp)1051 public static StringBuffer appendCodePoint(StringBuffer target, int cp) { 1052 return append(target, cp); 1053 } 1054 1055 /** 1056 * Adds a codepoint to offset16 position of the argument char array. 1057 * 1058 * @param target Char array to be append with the new code point 1059 * @param limit UTF16 offset which the codepoint will be appended. 1060 * @param char32 Code point to be appended 1061 * @return offset after char32 in the array. 1062 * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not 1063 * lie within the range of the Unicode codepoints. 
1064 * @stable ICU 2.1 1065 */ append(char[] target, int limit, int char32)1066 public static int append(char[] target, int limit, int char32) { 1067 // Check for irregular values 1068 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1069 throw new IllegalArgumentException("Illegal codepoint"); 1070 } 1071 // Write the UTF-16 values 1072 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1073 target[limit++] = getLeadSurrogate(char32); 1074 target[limit++] = getTrailSurrogate(char32); 1075 } else { 1076 target[limit++] = (char) char32; 1077 } 1078 return limit; 1079 } 1080 1081 /** 1082 * Number of codepoints in a UTF16 String 1083 * 1084 * @param source UTF16 string 1085 * @return number of codepoint in string 1086 * @stable ICU 2.1 1087 */ countCodePoint(String source)1088 public static int countCodePoint(String source) { 1089 if (source == null || source.length() == 0) { 1090 return 0; 1091 } 1092 return findCodePointOffset(source, source.length()); 1093 } 1094 1095 /** 1096 * Number of codepoints in a UTF16 String buffer 1097 * 1098 * @param source UTF16 string buffer 1099 * @return number of codepoint in string 1100 * @stable ICU 2.1 1101 */ countCodePoint(StringBuffer source)1102 public static int countCodePoint(StringBuffer source) { 1103 if (source == null || source.length() == 0) { 1104 return 0; 1105 } 1106 return findCodePointOffset(source, source.length()); 1107 } 1108 1109 /** 1110 * Number of codepoints in a UTF16 char array substring 1111 * 1112 * @param source UTF16 char array 1113 * @param start Offset of the substring 1114 * @param limit Offset of the substring 1115 * @return number of codepoint in the substring 1116 * @exception IndexOutOfBoundsException If start and limit are not valid. 
1117 * @stable ICU 2.1 1118 */ countCodePoint(char source[], int start, int limit)1119 public static int countCodePoint(char source[], int start, int limit) { 1120 if (source == null || source.length == 0) { 1121 return 0; 1122 } 1123 return findCodePointOffset(source, start, limit, limit - start); 1124 } 1125 1126 /** 1127 * Set a code point into a UTF16 position. Adjusts target according if we are replacing a 1128 * non-supplementary codepoint with a supplementary and vice versa. 1129 * 1130 * @param target Stringbuffer 1131 * @param offset16 UTF16 position to insert into 1132 * @param char32 Code point 1133 * @stable ICU 2.1 1134 */ setCharAt(StringBuffer target, int offset16, int char32)1135 public static void setCharAt(StringBuffer target, int offset16, int char32) { 1136 int count = 1; 1137 char single = target.charAt(offset16); 1138 1139 if (isSurrogate(single)) { 1140 // pairs of the surrogate with offset16 at the lead char found 1141 if (isLeadSurrogate(single) && (target.length() > offset16 + 1) 1142 && isTrailSurrogate(target.charAt(offset16 + 1))) { 1143 count++; 1144 } else { 1145 // pairs of the surrogate with offset16 at the trail char 1146 // found 1147 if (isTrailSurrogate(single) && (offset16 > 0) 1148 && isLeadSurrogate(target.charAt(offset16 - 1))) { 1149 offset16--; 1150 count++; 1151 } 1152 } 1153 } 1154 target.replace(offset16, offset16 + count, valueOf(char32)); 1155 } 1156 1157 /** 1158 * Set a code point into a UTF16 position in a char array. Adjusts target according if we are 1159 * replacing a non-supplementary codepoint with a supplementary and vice versa. 1160 * 1161 * @param target char array 1162 * @param limit numbers of valid chars in target, different from target.length. limit counts the 1163 * number of chars in target that represents a string, not the size of array target. 
1164 * @param offset16 UTF16 position to insert into 1165 * @param char32 code point 1166 * @return new number of chars in target that represents a string 1167 * @exception IndexOutOfBoundsException if offset16 is out of range 1168 * @stable ICU 2.1 1169 */ setCharAt(char target[], int limit, int offset16, int char32)1170 public static int setCharAt(char target[], int limit, int offset16, int char32) { 1171 if (offset16 >= limit) { 1172 throw new ArrayIndexOutOfBoundsException(offset16); 1173 } 1174 int count = 1; 1175 char single = target[offset16]; 1176 1177 if (isSurrogate(single)) { 1178 // pairs of the surrogate with offset16 at the lead char found 1179 if (isLeadSurrogate(single) && (target.length > offset16 + 1) 1180 && isTrailSurrogate(target[offset16 + 1])) { 1181 count++; 1182 } else { 1183 // pairs of the surrogate with offset16 at the trail char 1184 // found 1185 if (isTrailSurrogate(single) && (offset16 > 0) 1186 && isLeadSurrogate(target[offset16 - 1])) { 1187 offset16--; 1188 count++; 1189 } 1190 } 1191 } 1192 1193 String str = valueOf(char32); 1194 int result = limit; 1195 int strlength = str.length(); 1196 target[offset16] = str.charAt(0); 1197 if (count == strlength) { 1198 if (count == 2) { 1199 target[offset16 + 1] = str.charAt(1); 1200 } 1201 } else { 1202 // this is not exact match in space, we'll have to do some 1203 // shifting 1204 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit 1205 - (offset16 + count)); 1206 if (count < strlength) { 1207 // char32 is a supplementary character trying to squeeze into 1208 // a non-supplementary space 1209 target[offset16 + 1] = str.charAt(1); 1210 result++; 1211 if (result < target.length) { 1212 target[result] = 0; 1213 } 1214 } else { 1215 // char32 is a non-supplementary character trying to fill 1216 // into a supplementary space 1217 result--; 1218 target[result] = 0; 1219 } 1220 } 1221 return result; 1222 } 1223 1224 /** 1225 * Shifts offset16 by the argument number of 
codepoints 1226 * 1227 * @param source string 1228 * @param offset16 UTF16 position to shift 1229 * @param shift32 number of codepoints to shift 1230 * @return new shifted offset16 1231 * @exception IndexOutOfBoundsException if the new offset16 is out of bounds. 1232 * @stable ICU 2.1 1233 */ moveCodePointOffset(String source, int offset16, int shift32)1234 public static int moveCodePointOffset(String source, int offset16, int shift32) { 1235 int result = offset16; 1236 int size = source.length(); 1237 int count; 1238 char ch; 1239 if (offset16 < 0 || offset16 > size) { 1240 throw new StringIndexOutOfBoundsException(offset16); 1241 } 1242 if (shift32 > 0) { 1243 if (shift32 + offset16 > size) { 1244 throw new StringIndexOutOfBoundsException(offset16); 1245 } 1246 count = shift32; 1247 while (result < size && count > 0) { 1248 ch = source.charAt(result); 1249 if (isLeadSurrogate(ch) && ((result + 1) < size) 1250 && isTrailSurrogate(source.charAt(result + 1))) { 1251 result++; 1252 } 1253 count--; 1254 result++; 1255 } 1256 } else { 1257 if (offset16 + shift32 < 0) { 1258 throw new StringIndexOutOfBoundsException(offset16); 1259 } 1260 for (count = -shift32; count > 0; count--) { 1261 result--; 1262 if (result < 0) { 1263 break; 1264 } 1265 ch = source.charAt(result); 1266 if (isTrailSurrogate(ch) && result > 0 1267 && isLeadSurrogate(source.charAt(result - 1))) { 1268 result--; 1269 } 1270 } 1271 } 1272 if (count != 0) { 1273 throw new StringIndexOutOfBoundsException(shift32); 1274 } 1275 return result; 1276 } 1277 1278 /** 1279 * Shifts offset16 by the argument number of codepoints 1280 * 1281 * @param source String buffer 1282 * @param offset16 UTF16 position to shift 1283 * @param shift32 Number of codepoints to shift 1284 * @return new shifted offset16 1285 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds. 
1286 * @stable ICU 2.1 1287 */ moveCodePointOffset(StringBuffer source, int offset16, int shift32)1288 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) { 1289 int result = offset16; 1290 int size = source.length(); 1291 int count; 1292 char ch; 1293 if (offset16 < 0 || offset16 > size) { 1294 throw new StringIndexOutOfBoundsException(offset16); 1295 } 1296 if (shift32 > 0) { 1297 if (shift32 + offset16 > size) { 1298 throw new StringIndexOutOfBoundsException(offset16); 1299 } 1300 count = shift32; 1301 while (result < size && count > 0) { 1302 ch = source.charAt(result); 1303 if (isLeadSurrogate(ch) && ((result + 1) < size) 1304 && isTrailSurrogate(source.charAt(result + 1))) { 1305 result++; 1306 } 1307 count--; 1308 result++; 1309 } 1310 } else { 1311 if (offset16 + shift32 < 0) { 1312 throw new StringIndexOutOfBoundsException(offset16); 1313 } 1314 for (count = -shift32; count > 0; count--) { 1315 result--; 1316 if (result < 0) { 1317 break; 1318 } 1319 ch = source.charAt(result); 1320 if (isTrailSurrogate(ch) && result > 0 1321 && isLeadSurrogate(source.charAt(result - 1))) { 1322 result--; 1323 } 1324 } 1325 } 1326 if (count != 0) { 1327 throw new StringIndexOutOfBoundsException(shift32); 1328 } 1329 return result; 1330 } 1331 1332 /** 1333 * Shifts offset16 by the argument number of codepoints within a subarray. 1334 * 1335 * @param source Char array 1336 * @param start Position of the subarray to be performed on 1337 * @param limit Position of the subarray to be performed on 1338 * @param offset16 UTF16 position to shift relative to start 1339 * @param shift32 Number of codepoints to shift 1340 * @return new shifted offset16 relative to start 1341 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the 1342 * subarray bounds are out of range. 
1343 * @stable ICU 2.1 1344 */ moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32)1345 public static int moveCodePointOffset(char source[], int start, int limit, int offset16, 1346 int shift32) { 1347 int size = source.length; 1348 int count; 1349 char ch; 1350 int result = offset16 + start; 1351 if (start < 0 || limit < start) { 1352 throw new StringIndexOutOfBoundsException(start); 1353 } 1354 if (limit > size) { 1355 throw new StringIndexOutOfBoundsException(limit); 1356 } 1357 if (offset16 < 0 || result > limit) { 1358 throw new StringIndexOutOfBoundsException(offset16); 1359 } 1360 if (shift32 > 0) { 1361 if (shift32 + result > size) { 1362 throw new StringIndexOutOfBoundsException(result); 1363 } 1364 count = shift32; 1365 while (result < limit && count > 0) { 1366 ch = source[result]; 1367 if (isLeadSurrogate(ch) && (result + 1 < limit) 1368 && isTrailSurrogate(source[result + 1])) { 1369 result++; 1370 } 1371 count--; 1372 result++; 1373 } 1374 } else { 1375 if (result + shift32 < start) { 1376 throw new StringIndexOutOfBoundsException(result); 1377 } 1378 for (count = -shift32; count > 0; count--) { 1379 result--; 1380 if (result < start) { 1381 break; 1382 } 1383 ch = source[result]; 1384 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { 1385 result--; 1386 } 1387 } 1388 } 1389 if (count != 0) { 1390 throw new StringIndexOutOfBoundsException(shift32); 1391 } 1392 result -= start; 1393 return result; 1394 } 1395 1396 /** 1397 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1398 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1399 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2 1400 * otherwise. 
1401 * <p> 1402 * The overall effect is exactly as if the argument were converted to a string by the method 1403 * valueOf(char) and the characters in that string were then inserted into target at the 1404 * position indicated by offset16. 1405 * </p> 1406 * <p> 1407 * The offset argument must be greater than or equal to 0, and less than or equal to the length 1408 * of source. 1409 * 1410 * @param target String buffer to insert to 1411 * @param offset16 Offset which char32 will be inserted in 1412 * @param char32 Codepoint to be inserted 1413 * @return a reference to target 1414 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1415 * @stable ICU 2.1 1416 */ insert(StringBuffer target, int offset16, int char32)1417 public static StringBuffer insert(StringBuffer target, int offset16, int char32) { 1418 String str = valueOf(char32); 1419 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1420 offset16++; 1421 } 1422 target.insert(offset16, str); 1423 return target; 1424 } 1425 1426 /** 1427 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1428 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1429 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise. 1430 * <p> 1431 * The overall effect is exactly as if the argument were converted to a string by the method 1432 * valueOf(char) and the characters in that string were then inserted into target at the 1433 * position indicated by offset16. 1434 * </p> 1435 * <p> 1436 * The offset argument must be greater than or equal to 0, and less than or equal to the limit. 
1437 * 1438 * @param target Char array to insert to 1439 * @param limit End index of the char array, limit <= target.length 1440 * @param offset16 Offset which char32 will be inserted in 1441 * @param char32 Codepoint to be inserted 1442 * @return new limit size 1443 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1444 * @stable ICU 2.1 1445 */ insert(char target[], int limit, int offset16, int char32)1446 public static int insert(char target[], int limit, int offset16, int char32) { 1447 String str = valueOf(char32); 1448 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1449 offset16++; 1450 } 1451 int size = str.length(); 1452 if (limit + size > target.length) { 1453 throw new ArrayIndexOutOfBoundsException(offset16 + size); 1454 } 1455 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16); 1456 target[offset16] = str.charAt(0); 1457 if (size == 2) { 1458 target[offset16 + 1] = str.charAt(1); 1459 } 1460 return limit + size; 1461 } 1462 1463 /** 1464 * Removes the codepoint at the specified position in this target (shortening target by 1 1465 * character if the codepoint is a non-supplementary, 2 otherwise). 1466 * 1467 * @param target String buffer to remove codepoint from 1468 * @param offset16 Offset which the codepoint will be removed 1469 * @return a reference to target 1470 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 
1471 * @stable ICU 2.1 1472 */ delete(StringBuffer target, int offset16)1473 public static StringBuffer delete(StringBuffer target, int offset16) { 1474 int count = 1; 1475 switch (bounds(target, offset16)) { 1476 case LEAD_SURROGATE_BOUNDARY: 1477 count++; 1478 break; 1479 case TRAIL_SURROGATE_BOUNDARY: 1480 count++; 1481 offset16--; 1482 break; 1483 } 1484 target.delete(offset16, offset16 + count); 1485 return target; 1486 } 1487 1488 /** 1489 * Removes the codepoint at the specified position in this target (shortening target by 1 1490 * character if the codepoint is a non-supplementary, 2 otherwise). 1491 * 1492 * @param target String buffer to remove codepoint from 1493 * @param limit End index of the char array, limit <= target.length 1494 * @param offset16 Offset which the codepoint will be removed 1495 * @return a new limit size 1496 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1497 * @stable ICU 2.1 1498 */ delete(char target[], int limit, int offset16)1499 public static int delete(char target[], int limit, int offset16) { 1500 int count = 1; 1501 switch (bounds(target, 0, limit, offset16)) { 1502 case LEAD_SURROGATE_BOUNDARY: 1503 count++; 1504 break; 1505 case TRAIL_SURROGATE_BOUNDARY: 1506 count++; 1507 offset16--; 1508 break; 1509 } 1510 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count)); 1511 target[limit - count] = 0; 1512 return limit - count; 1513 } 1514 1515 /** 1516 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1517 * the argument codepoint. I.e., the smallest index <code>i</code> such that 1518 * <code>UTF16.charAt(source, i) == 1519 * char32</code> is true. 1520 * <p> 1521 * If no such character occurs in this string, then -1 is returned. 
1522 * </p> 1523 * <p> 1524 * Examples:<br> 1525 * UTF16.indexOf("abc", 'a') returns 0<br> 1526 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1527 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1528 * </p> 1529 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1530 * characters to its fullest. 1531 * 1532 * @param source UTF16 format Unicode string that will be searched 1533 * @param char32 Codepoint to search for 1534 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1535 * -1 if the codepoint does not occur. 1536 * @stable ICU 2.6 1537 */ indexOf(String source, int char32)1538 public static int indexOf(String source, int char32) { 1539 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1540 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1541 } 1542 // non-surrogate bmp 1543 if (char32 < LEAD_SURROGATE_MIN_VALUE 1544 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1545 return source.indexOf((char) char32); 1546 } 1547 // surrogate 1548 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1549 int result = source.indexOf((char) char32); 1550 if (result >= 0) { 1551 if (isLeadSurrogate(char32) && (result < source.length() - 1) 1552 && isTrailSurrogate(source.charAt(result + 1))) { 1553 return indexOf(source, char32, result + 1); 1554 } 1555 // trail surrogate 1556 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1557 return indexOf(source, char32, result + 1); 1558 } 1559 } 1560 return result; 1561 } 1562 // supplementary 1563 String char32str = toString(char32); 1564 return source.indexOf(char32str); 1565 } 1566 1567 /** 1568 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1569 * the argument string str. 
This method is implemented based on codepoints, hence a "lead 1570 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1571 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1572 * character before str found at in source will not have a valid match. Vice versa for lead 1573 * surrogates that ends str. See example below. 1574 * <p> 1575 * If no such string str occurs in this source, then -1 is returned. 1576 * </p> 1577 * <p> 1578 * Examples:<br> 1579 * UTF16.indexOf("abc", "ab") returns 0<br> 1580 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br> 1581 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br> 1582 * </p> 1583 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1584 * characters to its fullest. 1585 * 1586 * @param source UTF16 format Unicode string that will be searched 1587 * @param str UTF16 format Unicode string to search for 1588 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1589 * -1 if the codepoint does not occur. 
1590 * @stable ICU 2.6 1591 */ indexOf(String source, String str)1592 public static int indexOf(String source, String str) { 1593 int strLength = str.length(); 1594 // non-surrogate ends 1595 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1596 return source.indexOf(str); 1597 } 1598 1599 int result = source.indexOf(str); 1600 int resultEnd = result + strLength; 1601 if (result >= 0) { 1602 // check last character 1603 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1604 && isTrailSurrogate(source.charAt(resultEnd + 1))) { 1605 return indexOf(source, str, resultEnd + 1); 1606 } 1607 // check first character which is a trail surrogate 1608 if (isTrailSurrogate(str.charAt(0)) && result > 0 1609 && isLeadSurrogate(source.charAt(result - 1))) { 1610 return indexOf(source, str, resultEnd + 1); 1611 } 1612 } 1613 return result; 1614 } 1615 1616 /** 1617 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1618 * the argument codepoint. I.e., the smallest index i such that: <br> 1619 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true. 1620 * <p> 1621 * If no such character occurs in this string, then -1 is returned. 1622 * </p> 1623 * <p> 1624 * Examples:<br> 1625 * UTF16.indexOf("abc", 'a', 1) returns -1<br> 1626 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br> 1627 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br> 1628 * </p> 1629 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1630 * characters to its fullest. 1631 * 1632 * @param source UTF16 format Unicode string that will be searched 1633 * @param char32 Codepoint to search for 1634 * @param fromIndex The index to start the search from. 1635 * @return the index of the first occurrence of the codepoint in the argument Unicode string at 1636 * or after fromIndex, or -1 if the codepoint does not occur. 
1637 * @stable ICU 2.6 1638 */ indexOf(String source, int char32, int fromIndex)1639 public static int indexOf(String source, int char32, int fromIndex) { 1640 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1641 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1642 } 1643 // non-surrogate bmp 1644 if (char32 < LEAD_SURROGATE_MIN_VALUE 1645 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1646 return source.indexOf((char) char32, fromIndex); 1647 } 1648 // surrogate 1649 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1650 int result = source.indexOf((char) char32, fromIndex); 1651 if (result >= 0) { 1652 if (isLeadSurrogate(char32) && (result < source.length() - 1) 1653 && isTrailSurrogate(source.charAt(result + 1))) { 1654 return indexOf(source, char32, result + 1); 1655 } 1656 // trail surrogate 1657 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1658 return indexOf(source, char32, result + 1); 1659 } 1660 } 1661 return result; 1662 } 1663 // supplementary 1664 String char32str = toString(char32); 1665 return source.indexOf(char32str, fromIndex); 1666 } 1667 1668 /** 1669 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1670 * the argument string str. This method is implemented based on codepoints, hence a "lead 1671 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1672 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1673 * character before str found at in source will not have a valid match. Vice versa for lead 1674 * surrogates that ends str. See example below. 1675 * <p> 1676 * If no such string str occurs in this source, then -1 is returned. 
1677 * </p> 1678 * <p> 1679 * Examples:<br> 1680 * UTF16.indexOf("abc", "ab", 0) returns 0<br> 1681 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br> 1682 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br> 1683 * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br> 1684 * </p> 1685 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1686 * characters to its fullest. 1687 * 1688 * @param source UTF16 format Unicode string that will be searched 1689 * @param str UTF16 format Unicode string to search for 1690 * @param fromIndex The index to start the search from. 1691 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1692 * -1 if the codepoint does not occur. 1693 * @stable ICU 2.6 1694 */ indexOf(String source, String str, int fromIndex)1695 public static int indexOf(String source, String str, int fromIndex) { 1696 int strLength = str.length(); 1697 // non-surrogate ends 1698 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1699 return source.indexOf(str, fromIndex); 1700 } 1701 1702 int result = source.indexOf(str, fromIndex); 1703 int resultEnd = result + strLength; 1704 if (result >= 0) { 1705 // check last character 1706 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1707 && isTrailSurrogate(source.charAt(resultEnd))) { 1708 return indexOf(source, str, resultEnd + 1); 1709 } 1710 // check first character which is a trail surrogate 1711 if (isTrailSurrogate(str.charAt(0)) && result > 0 1712 && isLeadSurrogate(source.charAt(result - 1))) { 1713 return indexOf(source, str, resultEnd + 1); 1714 } 1715 } 1716 return result; 1717 } 1718 1719 /** 1720 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1721 * the argument codepoint. 
I.e., the index returned is the largest value i such that: 1722 * UTF16.charAt(source, i) == char32 is true. 1723 * <p> 1724 * Examples:<br> 1725 * UTF16.lastIndexOf("abc", 'a') returns 0<br> 1726 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1727 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1728 * </p> 1729 * <p> 1730 * source is searched backwards starting at the last character. 1731 * </p> 1732 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1733 * characters to its fullest. 1734 * 1735 * @param source UTF16 format Unicode string that will be searched 1736 * @param char32 Codepoint to search for 1737 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1738 * does not occur. 1739 * @stable ICU 2.6 1740 */ lastIndexOf(String source, int char32)1741 public static int lastIndexOf(String source, int char32) { 1742 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1743 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1744 } 1745 // non-surrogate bmp 1746 if (char32 < LEAD_SURROGATE_MIN_VALUE 1747 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1748 return source.lastIndexOf((char) char32); 1749 } 1750 // surrogate 1751 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1752 int result = source.lastIndexOf((char) char32); 1753 if (result >= 0) { 1754 if (isLeadSurrogate(char32) && (result < source.length() - 1) 1755 && isTrailSurrogate(source.charAt(result + 1))) { 1756 return lastIndexOf(source, char32, result - 1); 1757 } 1758 // trail surrogate 1759 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1760 return lastIndexOf(source, char32, result - 1); 1761 } 1762 } 1763 return result; 1764 } 1765 // supplementary 1766 String char32str = toString(char32); 1767 return source.lastIndexOf(char32str); 1768 } 1769 1770 /** 1771 * Returns the index within the argument UTF16 
format Unicode string of the last occurrence of
     * the argument string str. This method is implemented based on codepoints, hence a "lead
     * surrogate character + trail surrogate character" is treated as one entity. Hence if str
     * starts with a trail surrogate character at index 0, an occurrence of str in source that is
     * immediately preceded by a lead surrogate character is not a valid match. Vice versa for a
     * lead surrogate that ends str. See example below.
     * <p>
     * Examples:<br>
     * UTF16.lastIndexOf("abc", "a") returns 0<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
     * </p>
     * <p>
     * source is searched backwards starting at the last character.
     * </p>
     * Note this method is provided as support to jdk 1.3, which does not support supplementary
     * characters to its fullest.
     *
     * @param source UTF16 format Unicode string that will be searched
     * @param str UTF16 format Unicode string to search for
     * @return the index of the last occurrence of str in source, or -1 if str
     * does not occur.
1793 * @stable ICU 2.6 1794 */ lastIndexOf(String source, String str)1795 public static int lastIndexOf(String source, String str) { 1796 int strLength = str.length(); 1797 // non-surrogate ends 1798 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1799 return source.lastIndexOf(str); 1800 } 1801 1802 int result = source.lastIndexOf(str); 1803 if (result >= 0) { 1804 // check last character 1805 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1806 && isTrailSurrogate(source.charAt(result + strLength + 1))) { 1807 return lastIndexOf(source, str, result - 1); 1808 } 1809 // check first character which is a trail surrogate 1810 if (isTrailSurrogate(str.charAt(0)) && result > 0 1811 && isLeadSurrogate(source.charAt(result - 1))) { 1812 return lastIndexOf(source, str, result - 1); 1813 } 1814 } 1815 return result; 1816 } 1817 1818 /** 1819 * <p> 1820 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1821 * the argument codepoint, where the result is less than or equals to fromIndex. 1822 * </p> 1823 * <p> 1824 * This method is implemented based on codepoints, hence a single surrogate character will not 1825 * match a supplementary character. 1826 * </p> 1827 * <p> 1828 * source is searched backwards starting at the last character starting at the specified index. 1829 * </p> 1830 * <p> 1831 * Examples:<br> 1832 * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br> 1833 * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br> 1834 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br> 1835 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br> 1836 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1837 * </p> 1838 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1839 * characters to its fullest. 
1840 * 1841 * @param source UTF16 format Unicode string that will be searched 1842 * @param char32 Codepoint to search for 1843 * @param fromIndex the index to start the search from. There is no restriction on the value of 1844 * fromIndex. If it is greater than or equal to the length of this string, it has the 1845 * same effect as if it were equal to one less than the length of this string: this 1846 * entire string may be searched. If it is negative, it has the same effect as if it 1847 * were -1: -1 is returned. 1848 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1849 * does not occur. 1850 * @stable ICU 2.6 1851 */ lastIndexOf(String source, int char32, int fromIndex)1852 public static int lastIndexOf(String source, int char32, int fromIndex) { 1853 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1854 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1855 } 1856 // non-surrogate bmp 1857 if (char32 < LEAD_SURROGATE_MIN_VALUE 1858 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1859 return source.lastIndexOf((char) char32, fromIndex); 1860 } 1861 // surrogate 1862 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1863 int result = source.lastIndexOf((char) char32, fromIndex); 1864 if (result >= 0) { 1865 if (isLeadSurrogate(char32) && (result < source.length() - 1) 1866 && isTrailSurrogate(source.charAt(result + 1))) { 1867 return lastIndexOf(source, char32, result - 1); 1868 } 1869 // trail surrogate 1870 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1871 return lastIndexOf(source, char32, result - 1); 1872 } 1873 } 1874 return result; 1875 } 1876 // supplementary 1877 String char32str = toString(char32); 1878 return source.lastIndexOf(char32str, fromIndex); 1879 } 1880 1881 /** 1882 * <p> 1883 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1884 * the argument string str, where 
the result is less than or equals to fromIndex.
     * </p>
     * <p>
     * This method is implemented based on codepoints, hence a "lead surrogate character + trail
     * surrogate character" is treated as one entity. Hence if str starts with a trail surrogate
     * character at index 0, an occurrence of str in source that is immediately preceded by a lead
     * surrogate character is not a valid match. Vice versa for a lead surrogate that ends str.
     * </p>
     * See example below.
     * <p>
     * Examples:<br>
     * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
     * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
     * </p>
     * <p>
     * source is searched backwards starting at the specified index.
     * </p>
     * Note this method is provided as support to jdk 1.3, which does not support supplementary
     * characters to its fullest.
     *
     * @param source UTF16 format Unicode string that will be searched
     * @param str UTF16 format Unicode string to search for
     * @param fromIndex the index to start the search from. There is no restriction on the value of
     *            fromIndex. If it is greater than or equal to the length of this string, it has the
     *            same effect as if it were equal to one less than the length of this string: this
     *            entire string may be searched. If it is negative, it has the same effect as if it
     *            were -1: -1 is returned.
     * @return the index of the last occurrence of str in source, or -1 if str
     * does not occur.
1916 * @stable ICU 2.6 1917 */ lastIndexOf(String source, String str, int fromIndex)1918 public static int lastIndexOf(String source, String str, int fromIndex) { 1919 int strLength = str.length(); 1920 // non-surrogate ends 1921 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1922 return source.lastIndexOf(str, fromIndex); 1923 } 1924 1925 int result = source.lastIndexOf(str, fromIndex); 1926 if (result >= 0) { 1927 // check last character 1928 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1929 && isTrailSurrogate(source.charAt(result + strLength))) { 1930 return lastIndexOf(source, str, result - 1); 1931 } 1932 // check first character which is a trail surrogate 1933 if (isTrailSurrogate(str.charAt(0)) && result > 0 1934 && isLeadSurrogate(source.charAt(result - 1))) { 1935 return lastIndexOf(source, str, result - 1); 1936 } 1937 } 1938 return result; 1939 } 1940 1941 /** 1942 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of 1943 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16 1944 * format Unicode string source, then source will be returned. Otherwise, a new String object is 1945 * created that represents a codepoint sequence identical to the codepoint sequence represented 1946 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of 1947 * newChar32. 
     * <p>
     * Examples: <br>
     * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
     * returns "mosquito in your collar"<br>
     * UTF16.replace("JonL", 'q', 'x');<br>
     * returns "JonL" (no change)<br>
     * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
     * returns "Supplementary character !"<br>
     * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
     * returns "Supplementary character \ud800\udc00"<br>
     * </p>
     * Note this method is provided as support to jdk 1.3, which does not support supplementary
     * characters to its fullest.
     *
     * @param source UTF16 format Unicode string which the codepoint replacements will be based on.
     * @param oldChar32 Non-zero old codepoint to be replaced.
     * @param newChar32 The new codepoint to replace oldChar32
     * @return new String derived from source by replacing every occurrence of oldChar32 with
     *         newChar32; source itself is returned when oldChar32 does not occur in it.
1967 * @stable ICU 2.6 1968 */ replace(String source, int oldChar32, int newChar32)1969 public static String replace(String source, int oldChar32, int newChar32) { 1970 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) { 1971 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint"); 1972 } 1973 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) { 1974 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint"); 1975 } 1976 1977 int index = indexOf(source, oldChar32); 1978 if (index == -1) { 1979 return source; 1980 } 1981 String newChar32Str = toString(newChar32); 1982 int oldChar32Size = 1; 1983 int newChar32Size = newChar32Str.length(); 1984 StringBuffer result = new StringBuffer(source); 1985 int resultIndex = index; 1986 1987 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) { 1988 oldChar32Size = 2; 1989 } 1990 1991 while (index != -1) { 1992 int endResultIndex = resultIndex + oldChar32Size; 1993 result.replace(resultIndex, endResultIndex, newChar32Str); 1994 int lastEndIndex = index + oldChar32Size; 1995 index = indexOf(source, oldChar32, lastEndIndex); 1996 resultIndex += newChar32Size + index - lastEndIndex; 1997 } 1998 return result.toString(); 1999 } 2000 2001 /** 2002 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr 2003 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string 2004 * source, then source will be returned. Otherwise, a new String object is created that 2005 * represents a codepoint sequence identical to the codepoint sequence represented by source, 2006 * except that every occurrence of oldStr is replaced by an occurrence of newStr. 
2007 * <p> 2008 * Examples: <br> 2009 * UTF16.replace("mesquite in your cellar", "e", "o");<br> 2010 * returns "mosquito in your collar"<br> 2011 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br> 2012 * returns "cat in your cellar"<br> 2013 * UTF16.replace("JonL", "q", "x");<br> 2014 * returns "JonL" (no change)<br> 2015 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br> 2016 * returns "Supplementary character !"<br> 2017 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br> 2018 * returns "Supplementary character \ud800\udc00"<br> 2019 * </p> 2020 * Note this method is provided as support to jdk 1.3, which does not support supplementary 2021 * characters to its fullest. 2022 * 2023 * @param source UTF16 format Unicode string which the replacements will be based on. 2024 * @param oldStr Non-zero-length string to be replaced. 2025 * @param newStr The new string to replace oldStr 2026 * @return new String derived from source by replacing every occurrence of oldStr with newStr. 2027 * When no oldStr is found in source, then source will be returned. 2028 * @stable ICU 2.6 2029 */ replace(String source, String oldStr, String newStr)2030 public static String replace(String source, String oldStr, String newStr) { 2031 int index = indexOf(source, oldStr); 2032 if (index == -1) { 2033 return source; 2034 } 2035 int oldStrSize = oldStr.length(); 2036 int newStrSize = newStr.length(); 2037 StringBuffer result = new StringBuffer(source); 2038 int resultIndex = index; 2039 2040 while (index != -1) { 2041 int endResultIndex = resultIndex + oldStrSize; 2042 result.replace(resultIndex, endResultIndex, newStr); 2043 int lastEndIndex = index + oldStrSize; 2044 index = indexOf(source, oldStr, lastEndIndex); 2045 resultIndex += newStrSize + index - lastEndIndex; 2046 } 2047 return result.toString(); 2048 } 2049 2050 /** 2051 * Reverses a UTF16 format Unicode string and replaces source's content with it. 
This method 2052 * will reverse surrogate characters correctly, instead of blindly reversing every character. 2053 * <p> 2054 * Examples:<br> 2055 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br> 2056 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS". 2057 * 2058 * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed 2059 * @return a modified source with reversed UTF16 format Unicode string. 2060 * @stable ICU 2.6 2061 */ reverse(StringBuffer source)2062 public static StringBuffer reverse(StringBuffer source) { 2063 int length = source.length(); 2064 StringBuffer result = new StringBuffer(length); 2065 for (int i = length; i-- > 0;) { 2066 char ch = source.charAt(i); 2067 if (isTrailSurrogate(ch) && i > 0) { 2068 char ch2 = source.charAt(i - 1); 2069 if (isLeadSurrogate(ch2)) { 2070 result.append(ch2); 2071 result.append(ch); 2072 --i; 2073 continue; 2074 } 2075 } 2076 result.append(ch); 2077 } 2078 return result; 2079 } 2080 2081 /** 2082 * Check if the string contains more Unicode code points than a certain number. This is more 2083 * efficient than counting all code points in the entire string and comparing that number with a 2084 * threshold. This function may not need to scan the string at all if the length is within a 2085 * certain range, and never needs to count more than 'number + 1' code points. Logically 2086 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two 2087 * code units. 2088 * 2089 * @param source The input string. 2090 * @param number The number of code points in the string is compared against the 'number' 2091 * parameter. 2092 * @return boolean value for whether the string contains more Unicode code points than 'number'. 
2093 * @stable ICU 2.4 2094 */ hasMoreCodePointsThan(String source, int number)2095 public static boolean hasMoreCodePointsThan(String source, int number) { 2096 if (number < 0) { 2097 return true; 2098 } 2099 if (source == null) { 2100 return false; 2101 } 2102 int length = source.length(); 2103 2104 // length >= 0 known 2105 // source contains at least (length + 1) / 2 code points: <= 2 2106 // chars per cp 2107 if (((length + 1) >> 1) > number) { 2108 return true; 2109 } 2110 2111 // check if source does not even contain enough chars 2112 int maxsupplementary = length - number; 2113 if (maxsupplementary <= 0) { 2114 return false; 2115 } 2116 2117 // there are maxsupplementary = length - number more chars than 2118 // asked-for code points 2119 2120 // count code points until they exceed and also check that there are 2121 // no more than maxsupplementary supplementary code points (char pairs) 2122 int start = 0; 2123 while (true) { 2124 if (length == 0) { 2125 return false; 2126 } 2127 if (number == 0) { 2128 return true; 2129 } 2130 if (isLeadSurrogate(source.charAt(start++)) && start != length 2131 && isTrailSurrogate(source.charAt(start))) { 2132 start++; 2133 if (--maxsupplementary <= 0) { 2134 // too many pairs - too few code points 2135 return false; 2136 } 2137 } 2138 --number; 2139 } 2140 } 2141 2142 /** 2143 * Check if the sub-range of char array, from argument start to limit, contains more Unicode 2144 * code points than a certain number. This is more efficient than counting all code points in 2145 * the entire char array range and comparing that number with a threshold. This function may not 2146 * need to scan the char array at all if start and limit is within a certain range, and never 2147 * needs to count more than 'number + 1' code points. Logically equivalent to 2148 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one 2149 * or two code units. 
2150 * 2151 * @param source Array of UTF-16 chars 2152 * @param start Offset to substring in the source array for analyzing 2153 * @param limit Offset to substring in the source array for analyzing 2154 * @param number The number of code points in the string is compared against the 'number' 2155 * parameter. 2156 * @return boolean value for whether the string contains more Unicode code points than 'number'. 2157 * @exception IndexOutOfBoundsException Thrown when limit < start 2158 * @stable ICU 2.4 2159 */ hasMoreCodePointsThan(char source[], int start, int limit, int number)2160 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) { 2161 int length = limit - start; 2162 if (length < 0 || start < 0 || limit < 0) { 2163 throw new IndexOutOfBoundsException( 2164 "Start and limit indexes should be non-negative and start <= limit"); 2165 } 2166 if (number < 0) { 2167 return true; 2168 } 2169 if (source == null) { 2170 return false; 2171 } 2172 2173 // length >= 0 known 2174 // source contains at least (length + 1) / 2 code points: <= 2 2175 // chars per cp 2176 if (((length + 1) >> 1) > number) { 2177 return true; 2178 } 2179 2180 // check if source does not even contain enough chars 2181 int maxsupplementary = length - number; 2182 if (maxsupplementary <= 0) { 2183 return false; 2184 } 2185 2186 // there are maxsupplementary = length - number more chars than 2187 // asked-for code points 2188 2189 // count code points until they exceed and also check that there are 2190 // no more than maxsupplementary supplementary code points (char pairs) 2191 while (true) { 2192 if (length == 0) { 2193 return false; 2194 } 2195 if (number == 0) { 2196 return true; 2197 } 2198 if (isLeadSurrogate(source[start++]) && start != limit 2199 && isTrailSurrogate(source[start])) { 2200 start++; 2201 if (--maxsupplementary <= 0) { 2202 // too many pairs - too few code points 2203 return false; 2204 } 2205 } 2206 --number; 2207 } 2208 } 2209 2210 /** 
2211 * Check if the string buffer contains more Unicode code points than a certain number. This is 2212 * more efficient than counting all code points in the entire string buffer and comparing that 2213 * number with a threshold. This function may not need to scan the string buffer at all if the 2214 * length is within a certain range, and never needs to count more than 'number + 1' code 2215 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may 2216 * occupy either one or two code units. 2217 * 2218 * @param source The input string buffer. 2219 * @param number The number of code points in the string buffer is compared against the 'number' 2220 * parameter. 2221 * @return boolean value for whether the string buffer contains more Unicode code points than 2222 * 'number'. 2223 * @stable ICU 2.4 2224 */ hasMoreCodePointsThan(StringBuffer source, int number)2225 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) { 2226 if (number < 0) { 2227 return true; 2228 } 2229 if (source == null) { 2230 return false; 2231 } 2232 int length = source.length(); 2233 2234 // length >= 0 known 2235 // source contains at least (length + 1) / 2 code points: <= 2 2236 // chars per cp 2237 if (((length + 1) >> 1) > number) { 2238 return true; 2239 } 2240 2241 // check if source does not even contain enough chars 2242 int maxsupplementary = length - number; 2243 if (maxsupplementary <= 0) { 2244 return false; 2245 } 2246 2247 // there are maxsupplementary = length - number more chars than 2248 // asked-for code points 2249 2250 // count code points until they exceed and also check that there are 2251 // no more than maxsupplementary supplementary code points (char pairs) 2252 int start = 0; 2253 while (true) { 2254 if (length == 0) { 2255 return false; 2256 } 2257 if (number == 0) { 2258 return true; 2259 } 2260 if (isLeadSurrogate(source.charAt(start++)) && start != length 2261 && isTrailSurrogate(source.charAt(start))) { 2262 
start++; 2263 if (--maxsupplementary <= 0) { 2264 // too many pairs - too few code points 2265 return false; 2266 } 2267 } 2268 --number; 2269 } 2270 } 2271 2272 /** 2273 * Cover JDK 1.5 API. Create a String from an array of codePoints. 2274 * 2275 * @param codePoints The code array 2276 * @param offset The start of the text in the code point array 2277 * @param count The number of code points 2278 * @return a String representing the code points between offset and count 2279 * @throws IllegalArgumentException If an invalid code point is encountered 2280 * @throws IndexOutOfBoundsException If the offset or count are out of bounds. 2281 * @stable ICU 3.0 2282 */ newString(int[] codePoints, int offset, int count)2283 public static String newString(int[] codePoints, int offset, int count) { 2284 if (count < 0) { 2285 throw new IllegalArgumentException(); 2286 } 2287 char[] chars = new char[count]; 2288 int w = 0; 2289 for (int r = offset, e = offset + count; r < e; ++r) { 2290 int cp = codePoints[r]; 2291 if (cp < 0 || cp > 0x10ffff) { 2292 throw new IllegalArgumentException(); 2293 } 2294 while (true) { 2295 try { 2296 if (cp < 0x010000) { 2297 chars[w] = (char) cp; 2298 w++; 2299 } else { 2300 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); 2301 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); 2302 w += 2; 2303 } 2304 break; 2305 } catch (IndexOutOfBoundsException ex) { 2306 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2) 2307 / (r - offset + 1))); 2308 char[] temp = new char[newlen]; 2309 System.arraycopy(chars, 0, temp, 0, w); 2310 chars = temp; 2311 } 2312 } 2313 } 2314 return new String(chars, 0, w); 2315 } 2316 2317 /** 2318 * <p> 2319 * UTF16 string comparator class. 
Allows UTF16 string comparison to be done with the various 2320 * modes 2321 * </p> 2322 * <ul> 2323 * <li> Code point comparison or code unit comparison 2324 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison 2325 * with special handling for character 'i'. 2326 * </ul> 2327 * <p> 2328 * The code unit or code point comparison differ only when comparing supplementary code points 2329 * (\u10000..\u10ffff) to BMP code points near the end of the BMP (i.e., 2330 * \ue000..\uffff). In code unit comparison, high BMP code points sort after 2331 * supplementary code points because they are stored as pairs of surrogates which are at 2332 * \ud800..\udfff. 2333 * </p> 2334 * 2335 * @see #FOLD_CASE_DEFAULT 2336 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2337 * @stable ICU 2.1 2338 */ 2339 public static final class StringComparator implements java.util.Comparator<String> { 2340 // public constructor ------------------------------------------------ 2341 2342 /** 2343 * Default constructor that does code unit comparison and case sensitive comparison. 2344 * 2345 * @stable ICU 2.1 2346 */ StringComparator()2347 public StringComparator() { 2348 this(false, false, FOLD_CASE_DEFAULT); 2349 } 2350 2351 /** 2352 * Constructor that does comparison based on the argument options. 2353 * 2354 * @param codepointcompare Flag to indicate true for code point comparison or false for code unit 2355 * comparison. 2356 * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison 2357 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2358 * when ignorecase is set to true. If ignorecase is false, this option is 2359 * ignored. 
2360 * @see #FOLD_CASE_DEFAULT 2361 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2362 * @throws IllegalArgumentException If foldcaseoption is out of range 2363 * @stable ICU 2.4 2364 */ StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption)2365 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) { 2366 setCodePointCompare(codepointcompare); 2367 m_ignoreCase_ = ignorecase; 2368 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2369 throw new IllegalArgumentException("Invalid fold case option"); 2370 } 2371 m_foldCase_ = foldcaseoption; 2372 } 2373 2374 // public data member ------------------------------------------------ 2375 2376 /** 2377 * Option value for case folding comparison: 2378 * 2379 * <p>Comparison is case insensitive, strings are folded using default mappings defined in 2380 * Unicode data file CaseFolding.txt, before comparison. 2381 * 2382 * @stable ICU 2.4 2383 */ 2384 public static final int FOLD_CASE_DEFAULT = 0; 2385 2386 /** 2387 * Option value for case folding: 2388 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I 2389 * and dotless i appropriately for Turkic languages (tr, az). 2390 * 2391 * <p>Comparison is case insensitive, strings are folded using modified mappings defined in 2392 * Unicode data file CaseFolding.txt, before comparison. 2393 * 2394 * @stable ICU 2.4 2395 * @see com.ibm.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I 2396 */ 2397 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1; 2398 2399 // public methods ---------------------------------------------------- 2400 2401 // public setters ---------------------------------------------------- 2402 2403 /** 2404 * Sets the comparison mode to code point compare if flag is true. 
Otherwise comparison mode 2405 * is set to code unit compare 2406 * 2407 * @param flag True for code point compare, false for code unit compare 2408 * @stable ICU 2.4 2409 */ setCodePointCompare(boolean flag)2410 public void setCodePointCompare(boolean flag) { 2411 if (flag) { 2412 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER; 2413 } else { 2414 m_codePointCompare_ = 0; 2415 } 2416 } 2417 2418 /** 2419 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise 2420 * case sensitive comparison mode if set to false. 2421 * 2422 * @param ignorecase True for case-insensitive comparison, false for case sensitive comparison 2423 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2424 * when ignorecase is set to true. If ignorecase is false, this option is 2425 * ignored. 2426 * @see #FOLD_CASE_DEFAULT 2427 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2428 * @stable ICU 2.4 2429 */ setIgnoreCase(boolean ignorecase, int foldcaseoption)2430 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) { 2431 m_ignoreCase_ = ignorecase; 2432 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2433 throw new IllegalArgumentException("Invalid fold case option"); 2434 } 2435 m_foldCase_ = foldcaseoption; 2436 } 2437 2438 // public getters ---------------------------------------------------- 2439 2440 /** 2441 * Checks if the comparison mode is code point compare. 2442 * 2443 * @return true for code point compare, false for code unit compare 2444 * @stable ICU 2.4 2445 */ getCodePointCompare()2446 public boolean getCodePointCompare() { 2447 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2448 } 2449 2450 /** 2451 * Checks if Comparator is in the case insensitive mode. 
2452 * 2453 * @return true if Comparator performs case insensitive comparison, false otherwise 2454 * @stable ICU 2.4 2455 */ getIgnoreCase()2456 public boolean getIgnoreCase() { 2457 return m_ignoreCase_; 2458 } 2459 2460 /** 2461 * Gets the fold case options set in Comparator to be used with case insensitive comparison. 2462 * 2463 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I 2464 * @see #FOLD_CASE_DEFAULT 2465 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2466 * @stable ICU 2.4 2467 */ getIgnoreCaseOption()2468 public int getIgnoreCaseOption() { 2469 return m_foldCase_; 2470 } 2471 2472 // public other methods ---------------------------------------------- 2473 2474 /** 2475 * Compare two strings depending on the options selected during construction. 2476 * 2477 * @param a first source string. 2478 * @param b second source string. 2479 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b, 2480 * a positive value is returned. 2481 * @exception ClassCastException thrown when either a or b is not a String object 2482 * @stable ICU 4.4 2483 */ 2484 @Override compare(String a, String b)2485 public int compare(String a, String b) { 2486 if (Utility.sameObjects(a, b)) { 2487 return 0; 2488 } 2489 if (a == null) { 2490 return -1; 2491 } 2492 if (b == null) { 2493 return 1; 2494 } 2495 2496 if (m_ignoreCase_) { 2497 return compareCaseInsensitive(a, b); 2498 } 2499 return compareCaseSensitive(a, b); 2500 } 2501 2502 // private data member ---------------------------------------------- 2503 2504 /** 2505 * Code unit comparison flag. True if code unit comparison is required. False if code point 2506 * comparison is required. 2507 */ 2508 private int m_codePointCompare_; 2509 2510 /** 2511 * Fold case comparison option. 
2512 */ 2513 private int m_foldCase_; 2514 2515 /** 2516 * Flag indicator if ignore case is to be used during comparison 2517 */ 2518 private boolean m_ignoreCase_; 2519 2520 /** 2521 * Code point order offset for surrogate characters 2522 */ 2523 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800; 2524 2525 // private method --------------------------------------------------- 2526 2527 /** 2528 * Compares case insensitive. This is a direct port of ICU4C, to make maintenance life 2529 * easier. 2530 * 2531 * @param s1 2532 * first string to compare 2533 * @param s2 2534 * second string to compare 2535 * @return -1 is s1 < s2, 0 if equals, 2536 */ compareCaseInsensitive(String s1, String s2)2537 private int compareCaseInsensitive(String s1, String s2) { 2538 return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_ 2539 | Normalizer.COMPARE_IGNORE_CASE); 2540 } 2541 2542 /** 2543 * Compares case sensitive. This is a direct port of ICU4C, to make maintenance life 2544 * easier. 
2545 * 2546 * @param s1 2547 * first string to compare 2548 * @param s2 2549 * second string to compare 2550 * @return -1 is s1 < s2, 0 if equals, 2551 */ compareCaseSensitive(String s1, String s2)2552 private int compareCaseSensitive(String s1, String s2) { 2553 // compare identical prefixes - they do not need to be fixed up 2554 // limit1 = start1 + min(length1, length2) 2555 int length1 = s1.length(); 2556 int length2 = s2.length(); 2557 int minlength = length1; 2558 int result = 0; 2559 if (length1 < length2) { 2560 result = -1; 2561 } else if (length1 > length2) { 2562 result = 1; 2563 minlength = length2; 2564 } 2565 2566 char c1 = 0; 2567 char c2 = 0; 2568 int index = 0; 2569 for (; index < minlength; index++) { 2570 c1 = s1.charAt(index); 2571 c2 = s2.charAt(index); 2572 // check pseudo-limit 2573 if (c1 != c2) { 2574 break; 2575 } 2576 } 2577 2578 if (index == minlength) { 2579 return result; 2580 } 2581 2582 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2583 // if both values are in or above the surrogate range, fix them up 2584 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE 2585 && codepointcompare) { 2586 // subtract 0x2800 from BMP code points to make them smaller 2587 // than supplementary ones 2588 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1))) 2589 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) { 2590 // part of a surrogate pair, leave >=d800 2591 } else { 2592 // BMP code point - may be surrogate code point - make 2593 // < d800 2594 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2595 } 2596 2597 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1))) 2598 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) { 2599 // part of a surrogate pair, leave >=d800 2600 } else { 2601 // BMP code point - may be surrogate code point 
- make <d800 2602 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2603 } 2604 } 2605 2606 // now c1 and c2 are in UTF-32-compatible order 2607 return c1 - c2; 2608 } 2609 } 2610 2611 /** 2612 * Utility for getting a code point from a CharSequence that contains exactly one code point. 2613 * @return the code point IF the string is non-null and consists of a single code point. 2614 * otherwise returns -1. 2615 * @param s to test 2616 * @stable ICU 54 2617 */ getSingleCodePoint(CharSequence s)2618 public static int getSingleCodePoint(CharSequence s) { 2619 if (s == null || s.length() == 0) { 2620 return -1; 2621 } else if (s.length() == 1) { 2622 return s.charAt(0); 2623 } else if (s.length() > 2) { 2624 return -1; 2625 } 2626 2627 // at this point, len = 2 2628 int cp = Character.codePointAt(s, 0); 2629 if (cp > 0xFFFF) { // is surrogate pair 2630 return cp; 2631 } 2632 return -1; 2633 } 2634 2635 /** 2636 * Utility for comparing a code point to a string without having to create a new string. Returns the same results 2637 * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if 2638 * <pre> 2639 * sc = new StringComparator(true,false,0); 2640 * fast = UTF16.compareCodePoint(codePoint, charSequence) 2641 * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString()) 2642 * </pre> 2643 * then 2644 * <pre> 2645 * Integer.signum(fast) == Integer.signum(slower) 2646 * </pre> 2647 * @param codePoint to test 2648 * @param s to test 2649 * @return equivalent of code point comparator comparing two strings. 
2650 * @stable ICU 54 2651 */ compareCodePoint(int codePoint, CharSequence s)2652 public static int compareCodePoint(int codePoint, CharSequence s) { 2653 if (s == null) { 2654 return 1; 2655 } 2656 final int strLen = s.length(); 2657 if (strLen == 0) { 2658 return 1; 2659 } 2660 int second = Character.codePointAt(s, 0); 2661 int diff = codePoint - second; 2662 if (diff != 0) { 2663 return diff; 2664 } 2665 return strLen == Character.charCount(codePoint) ? 0 : -1; 2666 } 2667 2668 // private data members ------------------------------------------------- 2669 2670 /** 2671 * Shift value for lead surrogate to form a supplementary character. 2672 */ 2673 private static final int LEAD_SURROGATE_SHIFT_ = 10; 2674 2675 /** 2676 * Mask to retrieve the significant value from a trail surrogate. 2677 */ 2678 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 2679 2680 /** 2681 * Value that all lead surrogate starts with 2682 */ 2683 private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE 2684 - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); 2685 2686 // private methods ------------------------------------------------------ 2687 2688 /** 2689 * <p> 2690 * Converts argument code point and returns a String object representing the code point's value 2691 * in UTF16 format. 2692 * </p> 2693 * <p> 2694 * This method does not check for the validity of the codepoint, the results are not guaranteed 2695 * if a invalid codepoint is passed as argument. 2696 * </p> 2697 * <p> 2698 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise. 
2699 * </p> 2700 * 2701 * @param ch 2702 * code point 2703 * @return string representation of the code point 2704 */ toString(int ch)2705 private static String toString(int ch) { 2706 if (ch < SUPPLEMENTARY_MIN_VALUE) { 2707 return String.valueOf((char) ch); 2708 } 2709 2710 StringBuilder result = new StringBuilder(); 2711 result.append(getLeadSurrogate(ch)); 2712 result.append(getTrailSurrogate(ch)); 2713 return result.toString(); 2714 } 2715 } 2716 // eof 2717