• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  *******************************************************************************
3  * Copyright (C) 1996-2016, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.text;
9 
10 /**
11  * <p>
12  * Standalone utility class providing UTF16 character conversions and indexing conversions.
13  * </p>
14  * <p>
15  * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
16  * so searching for strings is a safe operation. Similarly, concatenation is always safe.
17  * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
18  * values for start and end are on those boundaries, since they arose from operations like
19  * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
20  * </p>
21  * <strong>Examples:</strong>
22  * <p>
23  * The following examples illustrate use of some of these methods.
24  *
25  * <pre>
26  * // iteration forwards: Original
27  * for (int i = 0; i &lt; s.length(); ++i) {
28  *     char ch = s.charAt(i);
29  *     doSomethingWith(ch);
30  * }
31  *
32  * // iteration forwards: Changes for UTF-32
33  * int ch;
34  * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
35  *     ch = UTF16.charAt(s, i);
36  *     doSomethingWith(ch);
37  * }
38  *
39  * // iteration backwards: Original
40  * for (int i = s.length() - 1; i &gt;= 0; --i) {
41  *     char ch = s.charAt(i);
42  *     doSomethingWith(ch);
43  * }
44  *
45  * // iteration backwards: Changes for UTF-32
46  * int ch;
47  * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
48  *     ch = UTF16.charAt(s, i);
49  *     doSomethingWith(ch);
50  * }
51  * </pre>
52  *
53  * <strong>Notes:</strong>
54  * <ul>
55  * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
56  * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
57  * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
58  * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
59  * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
60  * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
61  * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
62  * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
63  * </li>
64  * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
65  * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
66  * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
67  * check for validity if desired. </li>
68  * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
69  * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
70  * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
71  * 5.5). </li>
72  * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
73  * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
74  * percentage of all the text in the world, the singleton case should always be optimized for. </li>
75  * </ul>
76  *
77  * @author Mark Davis, with help from Markus Scherer
78  * @stable ICU 2.1
79  */
80 
81 public final class UTF16 {
82     // public variables ---------------------------------------------------
83 
84     /**
85      * Value returned in {@link #bounds(String, int) bounds()}.
86      * These values are chosen specifically so that it actually represents the position of the
87      * character [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)]
88      *
89      * @stable ICU 2.1
90      */
91     public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
92             TRAIL_SURROGATE_BOUNDARY = 5;
93 
94     /**
95      * The lowest Unicode code point value.
96      *
97      * @stable ICU 2.1
98      */
99     public static final int CODEPOINT_MIN_VALUE = 0;
100 
101     /**
102      * The highest Unicode code point value (scalar value) according to the Unicode Standard.
103      *
104      * @stable ICU 2.1
105      */
106     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
107 
108     /**
109      * The minimum value for Supplementary code points
110      *
111      * @stable ICU 2.1
112      */
113     public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
114 
115     /**
116      * Lead surrogate minimum value
117      *
118      * @stable ICU 2.1
119      */
120     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
121 
122     /**
123      * Trail surrogate minimum value
124      *
125      * @stable ICU 2.1
126      */
127     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
128 
129     /**
130      * Lead surrogate maximum value
131      *
132      * @stable ICU 2.1
133      */
134     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
135 
136     /**
137      * Trail surrogate maximum value
138      *
139      * @stable ICU 2.1
140      */
141     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
142 
143     /**
144      * Surrogate minimum value
145      *
146      * @stable ICU 2.1
147      */
148     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
149 
150     /**
151      * Maximum surrogate value
152      *
153      * @stable ICU 2.1
154      */
155     public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
156 
157     /**
158      * Lead surrogate bitmask
159      */
160     private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
161 
162     /**
163      * Trail surrogate bitmask
164      */
165     private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
166 
167     /**
168      * Surrogate bitmask
169      */
170     private static final int SURROGATE_BITMASK = 0xFFFFF800;
171 
172     /**
173      * Lead surrogate bits
174      */
175     private static final int LEAD_SURROGATE_BITS = 0xD800;
176 
177     /**
178      * Trail surrogate bits
179      */
180     private static final int TRAIL_SURROGATE_BITS = 0xDC00;
181 
182     /**
183      * Surrogate bits
184      */
185     private static final int SURROGATE_BITS = 0xD800;
186 
187     // constructor --------------------------------------------------------
188 
189     // /CLOVER:OFF
190     /**
191      * Prevent instance from being created.
192      */
UTF16()193     private UTF16() {
194     }
195 
196     // /CLOVER:ON
197     // public method ------------------------------------------------------
198 
199     /**
200      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
201      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
202      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
203      * UCharacter.isLegal()</a></code>
204      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
205      * character will be returned. If a complete supplementary character is not found the incomplete
206      * character will be returned
207      *
208      * @param source String of UTF-16 chars
209      * @param offset16 UTF-16 offset to the start of the character.
210      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
211      *         of that codepoint are the same as in <code>bounds32()</code>.
212      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
213      * @stable ICU 2.1
214      */
charAt(String source, int offset16)215     public static int charAt(String source, int offset16) {
216         char single = source.charAt(offset16);
217         if (single < LEAD_SURROGATE_MIN_VALUE) {
218             return single;
219         }
220         return _charAt(source, offset16, single);
221     }
222 
_charAt(String source, int offset16, char single)223     private static int _charAt(String source, int offset16, char single) {
224         if (single > TRAIL_SURROGATE_MAX_VALUE) {
225             return single;
226         }
227 
228         // Convert the UTF-16 surrogate pair if necessary.
229         // For simplicity in usage, and because the frequency of pairs is
230         // low, look both directions.
231 
232         if (single <= LEAD_SURROGATE_MAX_VALUE) {
233             ++offset16;
234             if (source.length() != offset16) {
235                 char trail = source.charAt(offset16);
236                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
237                     return Character.toCodePoint(single, trail);
238                 }
239             }
240         } else {
241             --offset16;
242             if (offset16 >= 0) {
243                 // single is a trail surrogate so
244                 char lead = source.charAt(offset16);
245                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
246                     return Character.toCodePoint(lead, single);
247                 }
248             }
249         }
250         return single; // return unmatched surrogate
251     }
252 
253     /**
254      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
255      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
256      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
257      * UCharacter.isLegal()</a></code>
258      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
259      * character will be returned. If a complete supplementary character is not found the incomplete
260      * character will be returned
261      *
262      * @param source Sequence of UTF-16 chars
263      * @param offset16 UTF-16 offset to the start of the character.
264      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
265      *         of that codepoint are the same as in <code>bounds32()</code>.
266      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
267      * @stable ICU 2.1
268      */
charAt(CharSequence source, int offset16)269     public static int charAt(CharSequence source, int offset16) {
270         char single = source.charAt(offset16);
271         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
272             return single;
273         }
274         return _charAt(source, offset16, single);
275     }
276 
_charAt(CharSequence source, int offset16, char single)277     private static int _charAt(CharSequence source, int offset16, char single) {
278         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
279             return single;
280         }
281 
282         // Convert the UTF-16 surrogate pair if necessary.
283         // For simplicity in usage, and because the frequency of pairs is
284         // low, look both directions.
285 
286         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
287             ++offset16;
288             if (source.length() != offset16) {
289                 char trail = source.charAt(offset16);
290                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
291                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
292                     return Character.toCodePoint(single, trail);
293                 }
294             }
295         } else {
296             --offset16;
297             if (offset16 >= 0) {
298                 // single is a trail surrogate so
299                 char lead = source.charAt(offset16);
300                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
301                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
302                     return Character.toCodePoint(lead, single);
303                 }
304             }
305         }
306         return single; // return unmatched surrogate
307     }
308 
309     /**
310      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
311      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
312      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
313      * </a></code>
314      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
315      * character will be returned. If a complete supplementary character is not found the incomplete
316      * character will be returned
317      *
318      * @param source UTF-16 chars string buffer
319      * @param offset16 UTF-16 offset to the start of the character.
320      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
321      *         of that codepoint are the same as in <code>bounds32()</code>.
322      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
323      * @stable ICU 2.1
324      */
charAt(StringBuffer source, int offset16)325     public static int charAt(StringBuffer source, int offset16) {
326         if (offset16 < 0 || offset16 >= source.length()) {
327             throw new StringIndexOutOfBoundsException(offset16);
328         }
329 
330         char single = source.charAt(offset16);
331         if (!isSurrogate(single)) {
332             return single;
333         }
334 
335         // Convert the UTF-16 surrogate pair if necessary.
336         // For simplicity in usage, and because the frequency of pairs is
337         // low, look both directions.
338 
339         if (single <= LEAD_SURROGATE_MAX_VALUE) {
340             ++offset16;
341             if (source.length() != offset16) {
342                 char trail = source.charAt(offset16);
343                 if (isTrailSurrogate(trail))
344                     return Character.toCodePoint(single, trail);
345             }
346         } else {
347             --offset16;
348             if (offset16 >= 0) {
349                 // single is a trail surrogate so
350                 char lead = source.charAt(offset16);
351                 if (isLeadSurrogate(lead)) {
352                     return Character.toCodePoint(lead, single);
353                 }
354             }
355         }
356         return single; // return unmatched surrogate
357     }
358 
359     /**
360      * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
361      * (with <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
362      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
363      * </a></code>
364      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
365      * character will be returned. If a complete supplementary character is not found the incomplete
366      * character will be returned
367      *
368      * @param source Array of UTF-16 chars
369      * @param start Offset of the first char of the substring to analyze in the source array
370      * @param limit Offset just past the last char of the substring to analyze in the source array
371      * @param offset16 UTF-16 offset relative to start
372      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
373      *         of that codepoint are the same as in <code>bounds32()</code>.
374      * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
375      * @stable ICU 2.1
376      */
charAt(char source[], int start, int limit, int offset16)377     public static int charAt(char source[], int start, int limit, int offset16) {
378         offset16 += start;
379         if (offset16 < start || offset16 >= limit) {
380             throw new ArrayIndexOutOfBoundsException(offset16);
381         }
382 
383         char single = source[offset16];
384         if (!isSurrogate(single)) {
385             return single;
386         }
387 
388         // Convert the UTF-16 surrogate pair if necessary.
389         // For simplicity in usage, and because the frequency of pairs is
390         // low, look both directions.
391         if (single <= LEAD_SURROGATE_MAX_VALUE) {
392             offset16++;
393             if (offset16 >= limit) {
394                 return single;
395             }
396             char trail = source[offset16];
397             if (isTrailSurrogate(trail)) {
398                 return Character.toCodePoint(single, trail);
399             }
400         } else { // isTrailSurrogate(single), so
401             if (offset16 == start) {
402                 return single;
403             }
404             offset16--;
405             char lead = source[offset16];
406             if (isLeadSurrogate(lead))
407                 return Character.toCodePoint(lead, single);
408         }
409         return single; // return unmatched surrogate
410     }
411 
412     /**
413      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
414      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
415      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
416      * </a></code>
417      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
418      * character will be returned. If a complete supplementary character is not found the incomplete
419      * character will be returned
420      *
421      * @param source UTF-16 chars string buffer
422      * @param offset16 UTF-16 offset to the start of the character.
423      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
424      *         of that codepoint are the same as in <code>bounds32()</code>.
425      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
426      * @stable ICU 2.1
427      */
charAt(Replaceable source, int offset16)428     public static int charAt(Replaceable source, int offset16) {
429         if (offset16 < 0 || offset16 >= source.length()) {
430             throw new StringIndexOutOfBoundsException(offset16);
431         }
432 
433         char single = source.charAt(offset16);
434         if (!isSurrogate(single)) {
435             return single;
436         }
437 
438         // Convert the UTF-16 surrogate pair if necessary.
439         // For simplicity in usage, and because the frequency of pairs is
440         // low, look both directions.
441 
442         if (single <= LEAD_SURROGATE_MAX_VALUE) {
443             ++offset16;
444             if (source.length() != offset16) {
445                 char trail = source.charAt(offset16);
446                 if (isTrailSurrogate(trail))
447                     return Character.toCodePoint(single, trail);
448             }
449         } else {
450             --offset16;
451             if (offset16 >= 0) {
452                 // single is a trail surrogate so
453                 char lead = source.charAt(offset16);
454                 if (isLeadSurrogate(lead)) {
455                     return Character.toCodePoint(lead, single);
456                 }
457             }
458         }
459         return single; // return unmatched surrogate
460     }
461 
462     /**
463      * Determines how many chars this char32 requires. If a validity check is required, use <code>
464      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
465      * on char32 before calling.
466      *
467      * @param char32 The input codepoint.
468      * @return 2 if is in supplementary space, otherwise 1.
469      * @stable ICU 2.1
470      */
getCharCount(int char32)471     public static int getCharCount(int char32) {
472         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
473             return 1;
474         }
475         return 2;
476     }
477 
478     /**
479      * Returns the type of the boundaries around the char at offset16. Used for random access.
480      *
481      * @param source Text to analyse
482      * @param offset16 UTF-16 offset
483      * @return
484      *            <ul>
485      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
486      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
487      *            are [offset16, offset16 + 2]
488      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
489      *            bounds are [offset16 - 1, offset16 + 1]
490      *            </ul>
491      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
492      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
493      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
494      * @stable ICU 2.1
495      */
bounds(String source, int offset16)496     public static int bounds(String source, int offset16) {
497         char ch = source.charAt(offset16);
498         if (isSurrogate(ch)) {
499             if (isLeadSurrogate(ch)) {
500                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
501                     return LEAD_SURROGATE_BOUNDARY;
502                 }
503             } else {
504                 // isTrailSurrogate(ch), so
505                 --offset16;
506                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
507                     return TRAIL_SURROGATE_BOUNDARY;
508                 }
509             }
510         }
511         return SINGLE_CHAR_BOUNDARY;
512     }
513 
514     /**
515      * Returns the type of the boundaries around the char at offset16. Used for random access.
516      *
517      * @param source String buffer to analyse
518      * @param offset16 UTF16 offset
519      * @return
520      *            <ul>
521      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
522      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
523      *            are [offset16, offset16 + 2]
524      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
525      *            bounds are [offset16 - 1, offset16 + 1]
526      *            </ul>
527      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
528      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
529      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
530      * @stable ICU 2.1
531      */
bounds(StringBuffer source, int offset16)532     public static int bounds(StringBuffer source, int offset16) {
533         char ch = source.charAt(offset16);
534         if (isSurrogate(ch)) {
535             if (isLeadSurrogate(ch)) {
536                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
537                     return LEAD_SURROGATE_BOUNDARY;
538                 }
539             } else {
540                 // isTrailSurrogate(ch), so
541                 --offset16;
542                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
543                     return TRAIL_SURROGATE_BOUNDARY;
544                 }
545             }
546         }
547         return SINGLE_CHAR_BOUNDARY;
548     }
549 
550     /**
551      * Returns the type of the boundaries around the char at offset16. Used for random access. Note
552      * that the boundaries are determined with respect to the subarray, hence the char array
553      * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
554      *
555      * @param source Char array to analyse
556      * @param start Offset of the first char of the substring to analyze in the source array
557      * @param limit Offset just past the last char of the substring to analyze in the source array
558      * @param offset16 UTF16 offset relative to start
559      * @return
560      *            <ul>
561      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
562      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
563      *            are [offset16, offset16 + 2]
564      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
565      *            bounds are [offset16 - 1, offset16 + 1]
566      *            </ul>
567      *            For bit-twiddlers, the boundary values for these are chosen so that the boundaries
568      *            can be gotten by: [offset16 - (boundvalue &gt;&gt; 2), offset16 + (boundvalue &amp; 3)].
569      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
570      * @stable ICU 2.1
571      */
bounds(char source[], int start, int limit, int offset16)572     public static int bounds(char source[], int start, int limit, int offset16) {
573         offset16 += start;
574         if (offset16 < start || offset16 >= limit) {
575             throw new ArrayIndexOutOfBoundsException(offset16);
576         }
577         char ch = source[offset16];
578         if (isSurrogate(ch)) {
579             if (isLeadSurrogate(ch)) {
580                 ++offset16;
581                 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
582                     return LEAD_SURROGATE_BOUNDARY;
583                 }
584             } else { // isTrailSurrogate(ch), so
585                 --offset16;
586                 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
587                     return TRAIL_SURROGATE_BOUNDARY;
588                 }
589             }
590         }
591         return SINGLE_CHAR_BOUNDARY;
592     }
593 
594     /**
595      * Determines whether the code value is a surrogate.
596      *
597      * @param char16 The input character.
598      * @return true If the input character is a surrogate.
599      * @stable ICU 2.1
600      */
isSurrogate(char char16)601     public static boolean isSurrogate(char char16) {
602         return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
603     }
604 
605     /**
606      * Determines whether the character is a trail surrogate.
607      *
608      * @param char16 The input character.
609      * @return true If the input character is a trail surrogate.
610      * @stable ICU 2.1
611      */
isTrailSurrogate(char char16)612     public static boolean isTrailSurrogate(char char16) {
613         return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
614     }
615 
616     /**
617      * Determines whether the character is a lead surrogate.
618      *
619      * @param char16 The input character.
620      * @return true If the input character is a lead surrogate
621      * @stable ICU 2.1
622      */
isLeadSurrogate(char char16)623     public static boolean isLeadSurrogate(char char16) {
624         return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
625     }
626 
627     /**
628      * Returns the lead surrogate. If a validity check is required, use
629      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
630      * before calling.
631      *
632      * @param char32 The input character.
633      * @return lead surrogate if the getCharCount(ch) is 2; <br>
634      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
635      * @stable ICU 2.1
636      */
getLeadSurrogate(int char32)637     public static char getLeadSurrogate(int char32) {
638         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
639             return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
640         }
641         return 0;
642     }
643 
644     /**
645      * Returns the trail surrogate. If a validity check is required, use
646      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
647      * before calling.
648      *
649      * @param char32 The input character.
650      * @return the trail surrogate if the getCharCount(ch) is 2; <br>
651      *         otherwise the character itself
652      * @stable ICU 2.1
653      */
getTrailSurrogate(int char32)654     public static char getTrailSurrogate(int char32) {
655         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
656             return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
657         }
658         return (char) char32;
659     }
660 
661     /**
662      * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
663      * containing the UTF-32 value in UTF16 format. If a validity check is required, use
664      * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before calling.
665      *
666      * @param char32 The input character.
667      * @return string value of char32 in UTF16 format
668      * @exception IllegalArgumentException Thrown if char32 is an invalid codepoint.
669      * @stable ICU 2.1
670      */
valueOf(int char32)671     public static String valueOf(int char32) {
672         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
673             throw new IllegalArgumentException("Illegal codepoint");
674         }
675         return toString(char32);
676     }
677 
678     /**
679      * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
680      * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
681      * character, the whole supplementary codepoint will be returned. If a validity check is
682      * required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the
683      * codepoint at offset16 before calling. The result returned will be a newly created String
684      * obtained by calling source.substring(..) with the appropriate indexes.
685      *
686      * @param source The input string.
687      * @param offset16 The UTF16 index to the codepoint in source
688      * @return string value of char32 in UTF16 format
689      * @stable ICU 2.1
690      */
valueOf(String source, int offset16)691     public static String valueOf(String source, int offset16) {
692         switch (bounds(source, offset16)) {
693         case LEAD_SURROGATE_BOUNDARY:
694             return source.substring(offset16, offset16 + 2);
695         case TRAIL_SURROGATE_BOUNDARY:
696             return source.substring(offset16 - 1, offset16 + 1);
697         default:
698             return source.substring(offset16, offset16 + 1);
699         }
700     }
701 
702     /**
703      * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
704      * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
705      * surrogate character, the whole supplementary codepoint will be returned. If a validity check
706      * is required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on
707      * the codepoint at offset16 before calling. The result returned will be a newly created String
708      * obtained by calling source.substring(..) with the appropriate indexes.
709      *
710      * @param source The input string buffer.
711      * @param offset16 The UTF16 index to the codepoint in source
712      * @return string value of char32 in UTF16 format
713      * @stable ICU 2.1
714      */
valueOf(StringBuffer source, int offset16)715     public static String valueOf(StringBuffer source, int offset16) {
716         switch (bounds(source, offset16)) {
717         case LEAD_SURROGATE_BOUNDARY:
718             return source.substring(offset16, offset16 + 2);
719         case TRAIL_SURROGATE_BOUNDARY:
720             return source.substring(offset16 - 1, offset16 + 1);
721         default:
722             return source.substring(offset16, offset16 + 1);
723         }
724     }
725 
726     /**
727      * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
728      * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
729      * returned, except when either the leading or trailing surrogate character lies out of the
730      * specified subarray. In the latter case, only the surrogate character within bounds will be
731      * returned. If a validity check is required, use
732      * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the codepoint at
733      * offset16 before calling. The result returned will be a newly created String containing the
734      * relevant characters.
735      *
736      * @param source The input char array.
737      * @param start Start index of the subarray
738      * @param limit End index of the subarray
739      * @param offset16 The UTF16 index to the codepoint in source relative to start
740      * @return string value of char32 in UTF16 format
741      * @stable ICU 2.1
742      */
valueOf(char source[], int start, int limit, int offset16)743     public static String valueOf(char source[], int start, int limit, int offset16) {
744         switch (bounds(source, start, limit, offset16)) {
745         case LEAD_SURROGATE_BOUNDARY:
746             return new String(source, start + offset16, 2);
747         case TRAIL_SURROGATE_BOUNDARY:
748             return new String(source, start + offset16 - 1, 2);
749         }
750         return new String(source, start + offset16, 1);
751     }
752 
753     /**
754      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
755      * the {@link UTF16 class description} for notes on roundtripping.
756      *
757      * @param source The UTF-16 string
758      * @param offset32 UTF-32 offset
759      * @return UTF-16 offset
760      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
761      * @stable ICU 2.1
762      */
findOffsetFromCodePoint(String source, int offset32)763     public static int findOffsetFromCodePoint(String source, int offset32) {
764         char ch;
765         int size = source.length(), result = 0, count = offset32;
766         if (offset32 < 0 || offset32 > size) {
767             throw new StringIndexOutOfBoundsException(offset32);
768         }
769         while (result < size && count > 0) {
770             ch = source.charAt(result);
771             if (isLeadSurrogate(ch) && ((result + 1) < size)
772                     && isTrailSurrogate(source.charAt(result + 1))) {
773                 result++;
774             }
775 
776             count--;
777             result++;
778         }
779         if (count != 0) {
780             throw new StringIndexOutOfBoundsException(offset32);
781         }
782         return result;
783     }
784 
785     /**
786      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
787      * the {@link UTF16 class description} for notes on roundtripping.
788      *
789      * @param source The UTF-16 string buffer
790      * @param offset32 UTF-32 offset
791      * @return UTF-16 offset
792      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
793      * @stable ICU 2.1
794      */
findOffsetFromCodePoint(StringBuffer source, int offset32)795     public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
796         char ch;
797         int size = source.length(), result = 0, count = offset32;
798         if (offset32 < 0 || offset32 > size) {
799             throw new StringIndexOutOfBoundsException(offset32);
800         }
801         while (result < size && count > 0) {
802             ch = source.charAt(result);
803             if (isLeadSurrogate(ch) && ((result + 1) < size)
804                     && isTrailSurrogate(source.charAt(result + 1))) {
805                 result++;
806             }
807 
808             count--;
809             result++;
810         }
811         if (count != 0) {
812             throw new StringIndexOutOfBoundsException(offset32);
813         }
814         return result;
815     }
816 
817     /**
818      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
819      * the {@link UTF16 class description} for notes on roundtripping.
820      *
821      * @param source The UTF-16 char array whose substring is to be analysed
822      * @param start Offset of the substring to be analysed
823      * @param limit Offset of the substring to be analysed
824      * @param offset32 UTF-32 offset relative to start
825      * @return UTF-16 offset relative to start
826      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
827      * @stable ICU 2.1
828      */
findOffsetFromCodePoint(char source[], int start, int limit, int offset32)829     public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
830         char ch;
831         int result = start, count = offset32;
832         if (offset32 > limit - start) {
833             throw new ArrayIndexOutOfBoundsException(offset32);
834         }
835         while (result < limit && count > 0) {
836             ch = source[result];
837             if (isLeadSurrogate(ch) && ((result + 1) < limit)
838                     && isTrailSurrogate(source[result + 1])) {
839                 result++;
840             }
841 
842             count--;
843             result++;
844         }
845         if (count != 0) {
846             throw new ArrayIndexOutOfBoundsException(offset32);
847         }
848         return result - start;
849     }
850 
851     /**
852      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
853      * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for
854      * notes on roundtripping.<br>
855      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
856      * of the <strong>lead</strong> of the pair is returned. </i>
857      * <p>
858      * To find the UTF-32 length of a string, use:
859      *
860      * <pre>
861      * len32 = countCodePoint(source, source.length());
862      * </pre>
863      *
864      * @param source Text to analyse
865      * @param offset16 UTF-16 offset &lt; source text length.
866      * @return UTF-32 offset
867      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
868      * @stable ICU 2.1
869      */
findCodePointOffset(String source, int offset16)870     public static int findCodePointOffset(String source, int offset16) {
871         if (offset16 < 0 || offset16 > source.length()) {
872             throw new StringIndexOutOfBoundsException(offset16);
873         }
874 
875         int result = 0;
876         char ch;
877         boolean hadLeadSurrogate = false;
878 
879         for (int i = 0; i < offset16; ++i) {
880             ch = source.charAt(i);
881             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
882                 hadLeadSurrogate = false; // count valid trail as zero
883             } else {
884                 hadLeadSurrogate = isLeadSurrogate(ch);
885                 ++result; // count others as 1
886             }
887         }
888 
889         if (offset16 == source.length()) {
890             return result;
891         }
892 
893         // end of source being the less significant surrogate character
894         // shift result back to the start of the supplementary character
895         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
896             result--;
897         }
898 
899         return result;
900     }
901 
902     /**
903      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
904      * offset. Used for random access. See the {@link UTF16 class description} for notes on
905      * roundtripping.<br>
906      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
907      * of the <strong>lead</strong> of the pair is returned. </i>
908      * <p>
909      * To find the UTF-32 length of a string, use:
910      *
911      * <pre>
912      * len32 = countCodePoint(source);
913      * </pre>
914      *
915      * @param source Text to analyse
916      * @param offset16 UTF-16 offset &lt; source text length.
917      * @return UTF-32 offset
918      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
919      * @stable ICU 2.1
920      */
findCodePointOffset(StringBuffer source, int offset16)921     public static int findCodePointOffset(StringBuffer source, int offset16) {
922         if (offset16 < 0 || offset16 > source.length()) {
923             throw new StringIndexOutOfBoundsException(offset16);
924         }
925 
926         int result = 0;
927         char ch;
928         boolean hadLeadSurrogate = false;
929 
930         for (int i = 0; i < offset16; ++i) {
931             ch = source.charAt(i);
932             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
933                 hadLeadSurrogate = false; // count valid trail as zero
934             } else {
935                 hadLeadSurrogate = isLeadSurrogate(ch);
936                 ++result; // count others as 1
937             }
938         }
939 
940         if (offset16 == source.length()) {
941             return result;
942         }
943 
944         // end of source being the less significant surrogate character
945         // shift result back to the start of the supplementary character
946         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
947             result--;
948         }
949 
950         return result;
951     }
952 
953     /**
954      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
955      * offset. Used for random access. See the {@link UTF16 class description} for notes on
956      * roundtripping.<br>
957      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
958      * of the <strong>lead</strong> of the pair is returned. </i>
959      * <p>
960      * To find the UTF-32 length of a substring, use:
961      *
962      * <pre>
963      * len32 = countCodePoint(source, start, limit);
964      * </pre>
965      *
966      * @param source Text to analyse
967      * @param start Offset of the substring
968      * @param limit Offset of the substring
969      * @param offset16 UTF-16 relative to start
970      * @return UTF-32 offset relative to start
971      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
972      * @stable ICU 2.1
973      */
findCodePointOffset(char source[], int start, int limit, int offset16)974     public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
975         offset16 += start;
976         if (offset16 > limit) {
977             throw new StringIndexOutOfBoundsException(offset16);
978         }
979 
980         int result = 0;
981         char ch;
982         boolean hadLeadSurrogate = false;
983 
984         for (int i = start; i < offset16; ++i) {
985             ch = source[i];
986             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
987                 hadLeadSurrogate = false; // count valid trail as zero
988             } else {
989                 hadLeadSurrogate = isLeadSurrogate(ch);
990                 ++result; // count others as 1
991             }
992         }
993 
994         if (offset16 == limit) {
995             return result;
996         }
997 
998         // end of source being the less significant surrogate character
999         // shift result back to the start of the supplementary character
1000         if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
1001             result--;
1002         }
1003 
1004         return result;
1005     }
1006 
1007     /**
1008      * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
1009      * use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before
1010      * calling.
1011      *
1012      * @param target The buffer to append to
1013      * @param char32 Value to append.
1014      * @return the updated StringBuffer
1015      * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints
1016      * @stable ICU 2.1
1017      */
append(StringBuffer target, int char32)1018     public static StringBuffer append(StringBuffer target, int char32) {
1019         // Check for irregular values
1020         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1021             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
1022         }
1023 
1024         // Write the UTF-16 values
1025         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1026             target.append(getLeadSurrogate(char32));
1027             target.append(getTrailSurrogate(char32));
1028         } else {
1029             target.append((char) char32);
1030         }
1031         return target;
1032     }
1033 
1034     /**
1035      * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
1036      * convenience.
1037      *
1038      * @param target The buffer to append to
1039      * @param cp The code point to append
1040      * @return the updated StringBuffer
1041      * @throws IllegalArgumentException If cp is not a valid code point
1042      * @stable ICU 3.0
1043      */
appendCodePoint(StringBuffer target, int cp)1044     public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
1045         return append(target, cp);
1046     }
1047 
1048     /**
1049      * Adds a codepoint to offset16 position of the argument char array.
1050      *
1051      * @param target Char array to be append with the new code point
1052      * @param limit UTF16 offset which the codepoint will be appended.
1053      * @param char32 Code point to be appended
1054      * @return offset after char32 in the array.
1055      * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not
1056      *                lie within the range of the Unicode codepoints.
1057      * @stable ICU 2.1
1058      */
append(char[] target, int limit, int char32)1059     public static int append(char[] target, int limit, int char32) {
1060         // Check for irregular values
1061         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1062             throw new IllegalArgumentException("Illegal codepoint");
1063         }
1064         // Write the UTF-16 values
1065         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1066             target[limit++] = getLeadSurrogate(char32);
1067             target[limit++] = getTrailSurrogate(char32);
1068         } else {
1069             target[limit++] = (char) char32;
1070         }
1071         return limit;
1072     }
1073 
1074     /**
1075      * Number of codepoints in a UTF16 String
1076      *
1077      * @param source UTF16 string
1078      * @return number of codepoint in string
1079      * @stable ICU 2.1
1080      */
countCodePoint(String source)1081     public static int countCodePoint(String source) {
1082         if (source == null || source.length() == 0) {
1083             return 0;
1084         }
1085         return findCodePointOffset(source, source.length());
1086     }
1087 
1088     /**
1089      * Number of codepoints in a UTF16 String buffer
1090      *
1091      * @param source UTF16 string buffer
1092      * @return number of codepoint in string
1093      * @stable ICU 2.1
1094      */
countCodePoint(StringBuffer source)1095     public static int countCodePoint(StringBuffer source) {
1096         if (source == null || source.length() == 0) {
1097             return 0;
1098         }
1099         return findCodePointOffset(source, source.length());
1100     }
1101 
1102     /**
1103      * Number of codepoints in a UTF16 char array substring
1104      *
1105      * @param source UTF16 char array
1106      * @param start Offset of the substring
1107      * @param limit Offset of the substring
1108      * @return number of codepoint in the substring
1109      * @exception IndexOutOfBoundsException If start and limit are not valid.
1110      * @stable ICU 2.1
1111      */
countCodePoint(char source[], int start, int limit)1112     public static int countCodePoint(char source[], int start, int limit) {
1113         if (source == null || source.length == 0) {
1114             return 0;
1115         }
1116         return findCodePointOffset(source, start, limit, limit - start);
1117     }
1118 
1119     /**
1120      * Set a code point into a UTF16 position. Adjusts target according if we are replacing a
1121      * non-supplementary codepoint with a supplementary and vice versa.
1122      *
1123      * @param target Stringbuffer
1124      * @param offset16 UTF16 position to insert into
1125      * @param char32 Code point
1126      * @stable ICU 2.1
1127      */
setCharAt(StringBuffer target, int offset16, int char32)1128     public static void setCharAt(StringBuffer target, int offset16, int char32) {
1129         int count = 1;
1130         char single = target.charAt(offset16);
1131 
1132         if (isSurrogate(single)) {
1133             // pairs of the surrogate with offset16 at the lead char found
1134             if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
1135                     && isTrailSurrogate(target.charAt(offset16 + 1))) {
1136                 count++;
1137             } else {
1138                 // pairs of the surrogate with offset16 at the trail char
1139                 // found
1140                 if (isTrailSurrogate(single) && (offset16 > 0)
1141                         && isLeadSurrogate(target.charAt(offset16 - 1))) {
1142                     offset16--;
1143                     count++;
1144                 }
1145             }
1146         }
1147         target.replace(offset16, offset16 + count, valueOf(char32));
1148     }
1149 
1150     /**
1151      * Set a code point into a UTF16 position in a char array. Adjusts target according if we are
1152      * replacing a non-supplementary codepoint with a supplementary and vice versa.
1153      *
1154      * @param target char array
1155      * @param limit numbers of valid chars in target, different from target.length. limit counts the
1156      *            number of chars in target that represents a string, not the size of array target.
1157      * @param offset16 UTF16 position to insert into
1158      * @param char32 code point
1159      * @return new number of chars in target that represents a string
1160      * @exception IndexOutOfBoundsException if offset16 is out of range
1161      * @stable ICU 2.1
1162      */
setCharAt(char target[], int limit, int offset16, int char32)1163     public static int setCharAt(char target[], int limit, int offset16, int char32) {
1164         if (offset16 >= limit) {
1165             throw new ArrayIndexOutOfBoundsException(offset16);
1166         }
1167         int count = 1;
1168         char single = target[offset16];
1169 
1170         if (isSurrogate(single)) {
1171             // pairs of the surrogate with offset16 at the lead char found
1172             if (isLeadSurrogate(single) && (target.length > offset16 + 1)
1173                     && isTrailSurrogate(target[offset16 + 1])) {
1174                 count++;
1175             } else {
1176                 // pairs of the surrogate with offset16 at the trail char
1177                 // found
1178                 if (isTrailSurrogate(single) && (offset16 > 0)
1179                         && isLeadSurrogate(target[offset16 - 1])) {
1180                     offset16--;
1181                     count++;
1182                 }
1183             }
1184         }
1185 
1186         String str = valueOf(char32);
1187         int result = limit;
1188         int strlength = str.length();
1189         target[offset16] = str.charAt(0);
1190         if (count == strlength) {
1191             if (count == 2) {
1192                 target[offset16 + 1] = str.charAt(1);
1193             }
1194         } else {
1195             // this is not exact match in space, we'll have to do some
1196             // shifting
1197             System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
1198                     - (offset16 + count));
1199             if (count < strlength) {
1200                 // char32 is a supplementary character trying to squeeze into
1201                 // a non-supplementary space
1202                 target[offset16 + 1] = str.charAt(1);
1203                 result++;
1204                 if (result < target.length) {
1205                     target[result] = 0;
1206                 }
1207             } else {
1208                 // char32 is a non-supplementary character trying to fill
1209                 // into a supplementary space
1210                 result--;
1211                 target[result] = 0;
1212             }
1213         }
1214         return result;
1215     }
1216 
1217     /**
1218      * Shifts offset16 by the argument number of codepoints
1219      *
1220      * @param source string
1221      * @param offset16 UTF16 position to shift
1222      * @param shift32 number of codepoints to shift
1223      * @return new shifted offset16
1224      * @exception IndexOutOfBoundsException if the new offset16 is out of bounds.
1225      * @stable ICU 2.1
1226      */
moveCodePointOffset(String source, int offset16, int shift32)1227     public static int moveCodePointOffset(String source, int offset16, int shift32) {
1228         int result = offset16;
1229         int size = source.length();
1230         int count;
1231         char ch;
1232         if (offset16 < 0 || offset16 > size) {
1233             throw new StringIndexOutOfBoundsException(offset16);
1234         }
1235         if (shift32 > 0) {
1236             if (shift32 + offset16 > size) {
1237                 throw new StringIndexOutOfBoundsException(offset16);
1238             }
1239             count = shift32;
1240             while (result < size && count > 0) {
1241                 ch = source.charAt(result);
1242                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1243                         && isTrailSurrogate(source.charAt(result + 1))) {
1244                     result++;
1245                 }
1246                 count--;
1247                 result++;
1248             }
1249         } else {
1250             if (offset16 + shift32 < 0) {
1251                 throw new StringIndexOutOfBoundsException(offset16);
1252             }
1253             for (count = -shift32; count > 0; count--) {
1254                 result--;
1255                 if (result < 0) {
1256                     break;
1257                 }
1258                 ch = source.charAt(result);
1259                 if (isTrailSurrogate(ch) && result > 0
1260                         && isLeadSurrogate(source.charAt(result - 1))) {
1261                     result--;
1262                 }
1263             }
1264         }
1265         if (count != 0) {
1266             throw new StringIndexOutOfBoundsException(shift32);
1267         }
1268         return result;
1269     }
1270 
1271     /**
1272      * Shifts offset16 by the argument number of codepoints
1273      *
1274      * @param source String buffer
1275      * @param offset16 UTF16 position to shift
1276      * @param shift32 Number of codepoints to shift
1277      * @return new shifted offset16
1278      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds.
1279      * @stable ICU 2.1
1280      */
moveCodePointOffset(StringBuffer source, int offset16, int shift32)1281     public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
1282         int result = offset16;
1283         int size = source.length();
1284         int count;
1285         char ch;
1286         if (offset16 < 0 || offset16 > size) {
1287             throw new StringIndexOutOfBoundsException(offset16);
1288         }
1289         if (shift32 > 0) {
1290             if (shift32 + offset16 > size) {
1291                 throw new StringIndexOutOfBoundsException(offset16);
1292             }
1293             count = shift32;
1294             while (result < size && count > 0) {
1295                 ch = source.charAt(result);
1296                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1297                         && isTrailSurrogate(source.charAt(result + 1))) {
1298                     result++;
1299                 }
1300                 count--;
1301                 result++;
1302             }
1303         } else {
1304             if (offset16 + shift32 < 0) {
1305                 throw new StringIndexOutOfBoundsException(offset16);
1306             }
1307             for (count = -shift32; count > 0; count--) {
1308                 result--;
1309                 if (result < 0) {
1310                     break;
1311                 }
1312                 ch = source.charAt(result);
1313                 if (isTrailSurrogate(ch) && result > 0
1314                         && isLeadSurrogate(source.charAt(result - 1))) {
1315                     result--;
1316                 }
1317             }
1318         }
1319         if (count != 0) {
1320             throw new StringIndexOutOfBoundsException(shift32);
1321         }
1322         return result;
1323     }
1324 
1325     /**
1326      * Shifts offset16 by the argument number of codepoints within a subarray.
1327      *
1328      * @param source Char array
1329      * @param start Position of the subarray to be performed on
1330      * @param limit Position of the subarray to be performed on
1331      * @param offset16 UTF16 position to shift relative to start
1332      * @param shift32 Number of codepoints to shift
1333      * @return new shifted offset16 relative to start
1334      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the
1335      *                subarray bounds are out of range.
1336      * @stable ICU 2.1
1337      */
moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32)1338     public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
1339             int shift32) {
1340         int size = source.length;
1341         int count;
1342         char ch;
1343         int result = offset16 + start;
1344         if (start < 0 || limit < start) {
1345             throw new StringIndexOutOfBoundsException(start);
1346         }
1347         if (limit > size) {
1348             throw new StringIndexOutOfBoundsException(limit);
1349         }
1350         if (offset16 < 0 || result > limit) {
1351             throw new StringIndexOutOfBoundsException(offset16);
1352         }
1353         if (shift32 > 0) {
1354             if (shift32 + result > size) {
1355                 throw new StringIndexOutOfBoundsException(result);
1356             }
1357             count = shift32;
1358             while (result < limit && count > 0) {
1359                 ch = source[result];
1360                 if (isLeadSurrogate(ch) && (result + 1 < limit)
1361                         && isTrailSurrogate(source[result + 1])) {
1362                     result++;
1363                 }
1364                 count--;
1365                 result++;
1366             }
1367         } else {
1368             if (result + shift32 < start) {
1369                 throw new StringIndexOutOfBoundsException(result);
1370             }
1371             for (count = -shift32; count > 0; count--) {
1372                 result--;
1373                 if (result < start) {
1374                     break;
1375                 }
1376                 ch = source[result];
1377                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
1378                     result--;
1379                 }
1380             }
1381         }
1382         if (count != 0) {
1383             throw new StringIndexOutOfBoundsException(shift32);
1384         }
1385         result -= start;
1386         return result;
1387     }
1388 
1389     /**
1390      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1391      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1392      * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
1393      * otherwise.
1394      * <p>
1395      * The overall effect is exactly as if the argument were converted to a string by the method
1396      * valueOf(char) and the characters in that string were then inserted into target at the
1397      * position indicated by offset16.
1398      * </p>
1399      * <p>
1400      * The offset argument must be greater than or equal to 0, and less than or equal to the length
1401      * of source.
1402      *
1403      * @param target String buffer to insert to
1404      * @param offset16 Offset which char32 will be inserted in
1405      * @param char32 Codepoint to be inserted
1406      * @return a reference to target
1407      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1408      * @stable ICU 2.1
1409      */
insert(StringBuffer target, int offset16, int char32)1410     public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
1411         String str = valueOf(char32);
1412         if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1413             offset16++;
1414         }
1415         target.insert(offset16, str);
1416         return target;
1417     }
1418 
1419     /**
1420      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1421      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1422      * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1423      * <p>
1424      * The overall effect is exactly as if the argument were converted to a string by the method
1425      * valueOf(char) and the characters in that string were then inserted into target at the
1426      * position indicated by offset16.
1427      * </p>
1428      * <p>
1429      * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
1430      *
1431      * @param target Char array to insert to
1432      * @param limit End index of the char array, limit &lt;= target.length
1433      * @param offset16 Offset which char32 will be inserted in
1434      * @param char32 Codepoint to be inserted
1435      * @return new limit size
1436      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1437      * @stable ICU 2.1
1438      */
insert(char target[], int limit, int offset16, int char32)1439     public static int insert(char target[], int limit, int offset16, int char32) {
1440         String str = valueOf(char32);
1441         if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1442             offset16++;
1443         }
1444         int size = str.length();
1445         if (limit + size > target.length) {
1446             throw new ArrayIndexOutOfBoundsException(offset16 + size);
1447         }
1448         System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
1449         target[offset16] = str.charAt(0);
1450         if (size == 2) {
1451             target[offset16 + 1] = str.charAt(1);
1452         }
1453         return limit + size;
1454     }
1455 
1456     /**
1457      * Removes the codepoint at the specified position in this target (shortening target by 1
1458      * character if the codepoint is a non-supplementary, 2 otherwise).
1459      *
1460      * @param target String buffer to remove codepoint from
1461      * @param offset16 Offset which the codepoint will be removed
1462      * @return a reference to target
1463      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1464      * @stable ICU 2.1
1465      */
delete(StringBuffer target, int offset16)1466     public static StringBuffer delete(StringBuffer target, int offset16) {
1467         int count = 1;
1468         switch (bounds(target, offset16)) {
1469         case LEAD_SURROGATE_BOUNDARY:
1470             count++;
1471             break;
1472         case TRAIL_SURROGATE_BOUNDARY:
1473             count++;
1474             offset16--;
1475             break;
1476         }
1477         target.delete(offset16, offset16 + count);
1478         return target;
1479     }
1480 
1481     /**
1482      * Removes the codepoint at the specified position in this target (shortening target by 1
1483      * character if the codepoint is a non-supplementary, 2 otherwise).
1484      *
1485      * @param target Char array to remove codepoint from
1486      * @param limit End index of the char array, limit &lt;= target.length
1487      * @param offset16 Offset which the codepoint will be removed
1488      * @return a new limit size
1489      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1490      * @stable ICU 2.1
1491      */
delete(char target[], int limit, int offset16)1492     public static int delete(char target[], int limit, int offset16) {
1493         int count = 1;
1494         switch (bounds(target, 0, limit, offset16)) {
1495         case LEAD_SURROGATE_BOUNDARY:
1496             count++;
1497             break;
1498         case TRAIL_SURROGATE_BOUNDARY:
1499             count++;
1500             offset16--;
1501             break;
1502         }
1503         System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
1504         target[limit - count] = 0;
1505         return limit - count;
1506     }
1507 
1508     /**
1509      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1510      * the argument codepoint. I.e., the smallest index <code>i</code> such that
1511      * <code>UTF16.charAt(source, i) ==
1512      * char32</code> is true.
1513      * <p>
1514      * If no such character occurs in this string, then -1 is returned.
1515      * </p>
1516      * <p>
1517      * Examples:<br>
1518      * UTF16.indexOf("abc", 'a') returns 0<br>
1519      * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1520      * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1521      * </p>
1522      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1523      * characters to its fullest.
1524      *
1525      * @param source UTF16 format Unicode string that will be searched
1526      * @param char32 Codepoint to search for
1527      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1528      *         -1 if the codepoint does not occur.
1529      * @stable ICU 2.6
1530      */
indexOf(String source, int char32)1531     public static int indexOf(String source, int char32) {
1532         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1533             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1534         }
1535         // non-surrogate bmp
1536         if (char32 < LEAD_SURROGATE_MIN_VALUE
1537                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1538             return source.indexOf((char) char32);
1539         }
1540         // surrogate
1541         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1542             int result = source.indexOf((char) char32);
1543             if (result >= 0) {
1544                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1545                         && isTrailSurrogate(source.charAt(result + 1))) {
1546                     return indexOf(source, char32, result + 1);
1547                 }
1548                 // trail surrogate
1549                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1550                     return indexOf(source, char32, result + 1);
1551                 }
1552             }
1553             return result;
1554         }
1555         // supplementary
1556         String char32str = toString(char32);
1557         return source.indexOf(char32str);
1558     }
1559 
1560     /**
1561      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1562      * the argument string str. This method is implemented based on codepoints, hence a "lead
1563      * surrogate character + trail surrogate character" is treated as one entity. Hence, if str
1564      * starts with a trail surrogate character, an occurrence of str in source that is immediately
1565      * preceded by a lead surrogate character is not a valid match. Vice versa for a lead
1566      * surrogate that ends str. See the examples below.
1567      * <p>
1568      * If no such string str occurs in this source, then -1 is returned.
1569      * </p>
1570      * <p>
1571      * Examples:<br>
1572      * UTF16.indexOf("abc", "ab") returns 0<br>
1573      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1574      * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1575      * </p>
1576      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1577      * characters to its fullest.
1578      *
1579      * @param source UTF16 format Unicode string that will be searched
1580      * @param str UTF16 format Unicode string to search for
1581      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1582      *         -1 if the codepoint does not occur.
1583      * @stable ICU 2.6
1584      */
indexOf(String source, String str)1585     public static int indexOf(String source, String str) {
1586         int strLength = str.length();
1587         // non-surrogate ends
1588         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1589             return source.indexOf(str);
1590         }
1591 
1592         int result = source.indexOf(str);
1593         int resultEnd = result + strLength;
1594         if (result >= 0) {
1595             // check last character
1596             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1597                     && isTrailSurrogate(source.charAt(resultEnd + 1))) {
1598                 return indexOf(source, str, resultEnd + 1);
1599             }
1600             // check first character which is a trail surrogate
1601             if (isTrailSurrogate(str.charAt(0)) && result > 0
1602                     && isLeadSurrogate(source.charAt(result - 1))) {
1603                 return indexOf(source, str, resultEnd + 1);
1604             }
1605         }
1606         return result;
1607     }
1608 
1609     /**
1610      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1611      * the argument codepoint. I.e., the smallest index i such that: <br>
1612      * (UTF16.charAt(source, i) == char32 &amp;&amp; i &gt;= fromIndex) is true.
1613      * <p>
1614      * If no such character occurs in this string, then -1 is returned.
1615      * </p>
1616      * <p>
1617      * Examples:<br>
1618      * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1619      * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
1620      * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
1621      * </p>
1622      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1623      * characters to its fullest.
1624      *
1625      * @param source UTF16 format Unicode string that will be searched
1626      * @param char32 Codepoint to search for
1627      * @param fromIndex The index to start the search from.
1628      * @return the index of the first occurrence of the codepoint in the argument Unicode string at
1629      *         or after fromIndex, or -1 if the codepoint does not occur.
1630      * @stable ICU 2.6
1631      */
indexOf(String source, int char32, int fromIndex)1632     public static int indexOf(String source, int char32, int fromIndex) {
1633         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1634             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1635         }
1636         // non-surrogate bmp
1637         if (char32 < LEAD_SURROGATE_MIN_VALUE
1638                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1639             return source.indexOf((char) char32, fromIndex);
1640         }
1641         // surrogate
1642         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1643             int result = source.indexOf((char) char32, fromIndex);
1644             if (result >= 0) {
1645                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1646                         && isTrailSurrogate(source.charAt(result + 1))) {
1647                     return indexOf(source, char32, result + 1);
1648                 }
1649                 // trail surrogate
1650                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1651                     return indexOf(source, char32, result + 1);
1652                 }
1653             }
1654             return result;
1655         }
1656         // supplementary
1657         String char32str = toString(char32);
1658         return source.indexOf(char32str, fromIndex);
1659     }
1660 
1661     /**
1662      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1663      * the argument string str. This method is implemented based on codepoints, hence a "lead
1664      * surrogate character + trail surrogate character" is treated as one entity. Hence, if str
1665      * starts with a trail surrogate character, an occurrence of str in source that is immediately
1666      * preceded by a lead surrogate character is not a valid match. Vice versa for a lead
1667      * surrogate that ends str. See the examples below.
1668      * <p>
1669      * If no such string str occurs in this source, then -1 is returned.
1670      * </p>
1671      * <p>
1672      * Examples:<br>
1673      * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1674      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
1675      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
1676      * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
1677      * </p>
1678      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1679      * characters to its fullest.
1680      *
1681      * @param source UTF16 format Unicode string that will be searched
1682      * @param str UTF16 format Unicode string to search for
1683      * @param fromIndex The index to start the search from.
1684      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1685      *         -1 if the codepoint does not occur.
1686      * @stable ICU 2.6
1687      */
indexOf(String source, String str, int fromIndex)1688     public static int indexOf(String source, String str, int fromIndex) {
1689         int strLength = str.length();
1690         // non-surrogate ends
1691         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1692             return source.indexOf(str, fromIndex);
1693         }
1694 
1695         int result = source.indexOf(str, fromIndex);
1696         int resultEnd = result + strLength;
1697         if (result >= 0) {
1698             // check last character
1699             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1700                     && isTrailSurrogate(source.charAt(resultEnd))) {
1701                 return indexOf(source, str, resultEnd + 1);
1702             }
1703             // check first character which is a trail surrogate
1704             if (isTrailSurrogate(str.charAt(0)) && result > 0
1705                     && isLeadSurrogate(source.charAt(result - 1))) {
1706                 return indexOf(source, str, resultEnd + 1);
1707             }
1708         }
1709         return result;
1710     }
1711 
1712     /**
1713      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1714      * the argument codepoint. I.e., the index returned is the largest value i such that:
1715      * UTF16.charAt(source, i) == char32 is true.
1716      * <p>
1717      * Examples:<br>
1718      * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1719      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1720      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1721      * </p>
1722      * <p>
1723      * source is searched backwards starting at the last character.
1724      * </p>
1725      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1726      * characters to its fullest.
1727      *
1728      * @param source UTF16 format Unicode string that will be searched
1729      * @param char32 Codepoint to search for
1730      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1731      *         does not occur.
1732      * @stable ICU 2.6
1733      */
lastIndexOf(String source, int char32)1734     public static int lastIndexOf(String source, int char32) {
1735         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1736             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1737         }
1738         // non-surrogate bmp
1739         if (char32 < LEAD_SURROGATE_MIN_VALUE
1740                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1741             return source.lastIndexOf((char) char32);
1742         }
1743         // surrogate
1744         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1745             int result = source.lastIndexOf((char) char32);
1746             if (result >= 0) {
1747                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1748                         && isTrailSurrogate(source.charAt(result + 1))) {
1749                     return lastIndexOf(source, char32, result - 1);
1750                 }
1751                 // trail surrogate
1752                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1753                     return lastIndexOf(source, char32, result - 1);
1754                 }
1755             }
1756             return result;
1757         }
1758         // supplementary
1759         String char32str = toString(char32);
1760         return source.lastIndexOf(char32str);
1761     }
1762 
1763     /**
1764      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1765      * the argument string str. This method is implemented based on codepoints, hence a "lead
1766      * surrogate character + trail surrogate character" is treated as one entity. Hence, if str
1767      * starts with a trail surrogate character, an occurrence of str in source that is immediately
1768      * preceded by a lead surrogate character is not a valid match. Vice versa for a lead
1769      * surrogate that ends str. See the examples below.
1770      * <p>
1771      * Examples:<br>
1772      * UTF16.lastIndexOf("abc", "a") returns 0<br>
1773      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1774      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1775      * </p>
1776      * <p>
1777      * source is searched backwards starting at the last character.
1778      * </p>
1779      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1780      * characters to its fullest.
1781      *
1782      * @param source UTF16 format Unicode string that will be searched
1783      * @param str UTF16 format Unicode string to search for
1784      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1785      *         does not occur.
1786      * @stable ICU 2.6
1787      */
lastIndexOf(String source, String str)1788     public static int lastIndexOf(String source, String str) {
1789         int strLength = str.length();
1790         // non-surrogate ends
1791         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1792             return source.lastIndexOf(str);
1793         }
1794 
1795         int result = source.lastIndexOf(str);
1796         if (result >= 0) {
1797             // check last character
1798             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1799                     && isTrailSurrogate(source.charAt(result + strLength + 1))) {
1800                 return lastIndexOf(source, str, result - 1);
1801             }
1802             // check first character which is a trail surrogate
1803             if (isTrailSurrogate(str.charAt(0)) && result > 0
1804                     && isLeadSurrogate(source.charAt(result - 1))) {
1805                 return lastIndexOf(source, str, result - 1);
1806             }
1807         }
1808         return result;
1809     }
1810 
1811     /**
1812      * <p>
1813      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1814      * the argument codepoint, where the result is less than or equals to fromIndex.
1815      * </p>
1816      * <p>
1817      * This method is implemented based on codepoints, hence a single surrogate character will not
1818      * match a supplementary character.
1819      * </p>
1820      * <p>
1821      * source is searched backwards starting at the last character starting at the specified index.
1822      * </p>
1823      * <p>
1824      * Examples:<br>
1825      * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1826      * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1827      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
1828      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
1829      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800, 4) returns -1<br>
1830      * </p>
1831      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1832      * characters to its fullest.
1833      *
1834      * @param source UTF16 format Unicode string that will be searched
1835      * @param char32 Codepoint to search for
1836      * @param fromIndex the index to start the search from. There is no restriction on the value of
1837      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1838      *            same effect as if it were equal to one less than the length of this string: this
1839      *            entire string may be searched. If it is negative, it has the same effect as if it
1840      *            were -1: -1 is returned.
1841      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1842      *         does not occur.
1843      * @stable ICU 2.6
1844      */
lastIndexOf(String source, int char32, int fromIndex)1845     public static int lastIndexOf(String source, int char32, int fromIndex) {
1846         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1847             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1848         }
1849         // non-surrogate bmp
1850         if (char32 < LEAD_SURROGATE_MIN_VALUE
1851                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1852             return source.lastIndexOf((char) char32, fromIndex);
1853         }
1854         // surrogate
1855         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1856             int result = source.lastIndexOf((char) char32, fromIndex);
1857             if (result >= 0) {
1858                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1859                         && isTrailSurrogate(source.charAt(result + 1))) {
1860                     return lastIndexOf(source, char32, result - 1);
1861                 }
1862                 // trail surrogate
1863                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1864                     return lastIndexOf(source, char32, result - 1);
1865                 }
1866             }
1867             return result;
1868         }
1869         // supplementary
1870         String char32str = toString(char32);
1871         return source.lastIndexOf(char32str, fromIndex);
1872     }
1873 
1874     /**
1875      * <p>
1876      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1877      * the argument string str, where the result is less than or equals to fromIndex.
1878      * </p>
1879      * <p>
1880      * This method is implemented based on codepoints, hence a "lead surrogate character + trail
1881      * surrogate character" is treated as one entity. Hence, if str starts with a trail surrogate
1882      * character, an occurrence of str in source that is immediately preceded by a lead surrogate
1883      * character is not a valid match. Vice versa for a lead surrogate that ends str.
1884      * </p>
1885      * See example below.
1886      * <p>
1887      * Examples:<br>
1888      * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
1889      * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
1890      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
1891      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
1892      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
1893      * </p>
1894      * <p>
1895      * source is searched backwards starting at the last character.
1896      * </p>
1897      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1898      * characters to its fullest.
1899      *
1900      * @param source UTF16 format Unicode string that will be searched
1901      * @param str UTF16 format Unicode string to search for
1902      * @param fromIndex the index to start the search from. There is no restriction on the value of
1903      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1904      *            same effect as if it were equal to one less than the length of this string: this
1905      *            entire string may be searched. If it is negative, it has the same effect as if it
1906      *            were -1: -1 is returned.
1907      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1908      *         does not occur.
1909      * @stable ICU 2.6
1910      */
lastIndexOf(String source, String str, int fromIndex)1911     public static int lastIndexOf(String source, String str, int fromIndex) {
1912         int strLength = str.length();
1913         // non-surrogate ends
1914         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1915             return source.lastIndexOf(str, fromIndex);
1916         }
1917 
1918         int result = source.lastIndexOf(str, fromIndex);
1919         if (result >= 0) {
1920             // check last character
1921             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1922                     && isTrailSurrogate(source.charAt(result + strLength))) {
1923                 return lastIndexOf(source, str, result - 1);
1924             }
1925             // check first character which is a trail surrogate
1926             if (isTrailSurrogate(str.charAt(0)) && result > 0
1927                     && isLeadSurrogate(source.charAt(result - 1))) {
1928                 return lastIndexOf(source, str, result - 1);
1929             }
1930         }
1931         return result;
1932     }
1933 
1934     /**
1935      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
1936      * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
1937      * format Unicode string source, then source will be returned. Otherwise, a new String object is
1938      * created that represents a codepoint sequence identical to the codepoint sequence represented
1939      * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
1940      * newChar32.
1941      * <p>
1942      * Examples: <br>
1943      * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
1944      * returns "mosquito in your collar"<br>
1945      * UTF16.replace("JonL", 'q', 'x');<br>
1946      * returns "JonL" (no change)<br>
1947      * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
1948      * returns "Supplementary character !"<br>
1949      * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
1950      * returns "Supplementary character \ud800\udc00"<br>
1951      * </p>
1952      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1953      * characters to its fullest.
1954      *
1955      * @param source UTF16 format Unicode string which the codepoint replacements will be based on.
1956      * @param oldChar32 Non-zero old codepoint to be replaced.
1957      * @param newChar32 The new codepoint to replace oldChar32
1958      * @return new String derived from source by replacing every occurrence of oldChar32 with
1959      *         newChar32, unless when no oldChar32 is found in source then source will be returned.
1960      * @stable ICU 2.6
1961      */
replace(String source, int oldChar32, int newChar32)1962     public static String replace(String source, int oldChar32, int newChar32) {
1963         if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
1964             throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
1965         }
1966         if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
1967             throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
1968         }
1969 
1970         int index = indexOf(source, oldChar32);
1971         if (index == -1) {
1972             return source;
1973         }
1974         String newChar32Str = toString(newChar32);
1975         int oldChar32Size = 1;
1976         int newChar32Size = newChar32Str.length();
1977         StringBuffer result = new StringBuffer(source);
1978         int resultIndex = index;
1979 
1980         if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
1981             oldChar32Size = 2;
1982         }
1983 
1984         while (index != -1) {
1985             int endResultIndex = resultIndex + oldChar32Size;
1986             result.replace(resultIndex, endResultIndex, newChar32Str);
1987             int lastEndIndex = index + oldChar32Size;
1988             index = indexOf(source, oldChar32, lastEndIndex);
1989             resultIndex += newChar32Size + index - lastEndIndex;
1990         }
1991         return result.toString();
1992     }
1993 
1994     /**
1995      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
1996      * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
1997      * source, then source will be returned. Otherwise, a new String object is created that
1998      * represents a codepoint sequence identical to the codepoint sequence represented by source,
1999      * except that every occurrence of oldStr is replaced by an occurrence of newStr.
2000      * <p>
2001      * Examples: <br>
2002      * UTF16.replace("mesquite in your cellar", "e", "o");<br>
2003      * returns "mosquito in your collar"<br>
2004      * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
2005      * returns "cat in your cellar"<br>
2006      * UTF16.replace("JonL", "q", "x");<br>
2007      * returns "JonL" (no change)<br>
2008      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", "!"); <br>
2009      * returns "Supplementary character !"<br>
2010      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", "!"); <br>
2011      * returns "Supplementary character \ud800\udc00"<br>
2012      * </p>
2013      * Note this method is provided as support to jdk 1.3, which does not support supplementary
2014      * characters to its fullest.
2015      *
2016      * @param source UTF16 format Unicode string which the replacements will be based on.
2017      * @param oldStr Non-zero-length string to be replaced.
2018      * @param newStr The new string to replace oldStr
2019      * @return new String derived from source by replacing every occurrence of oldStr with newStr.
2020      *         When no oldStr is found in source, then source will be returned.
2021      * @stable ICU 2.6
2022      */
replace(String source, String oldStr, String newStr)2023     public static String replace(String source, String oldStr, String newStr) {
2024         int index = indexOf(source, oldStr);
2025         if (index == -1) {
2026             return source;
2027         }
2028         int oldStrSize = oldStr.length();
2029         int newStrSize = newStr.length();
2030         StringBuffer result = new StringBuffer(source);
2031         int resultIndex = index;
2032 
2033         while (index != -1) {
2034             int endResultIndex = resultIndex + oldStrSize;
2035             result.replace(resultIndex, endResultIndex, newStr);
2036             int lastEndIndex = index + oldStrSize;
2037             index = indexOf(source, oldStr, lastEndIndex);
2038             resultIndex += newStrSize + index - lastEndIndex;
2039         }
2040         return result.toString();
2041     }
2042 
2043     /**
2044      * Returns a new StringBuffer holding the content of source in reverse order; source itself is
2045      * not modified. Surrogate pairs are kept in lead-trail order instead of being blindly reversed.
2046      * <p>
2047      * Examples:<br>
2048      * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
2049      * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
2050      *
2051      * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed
2052      * @return a new StringBuffer containing the reversed UTF16 format Unicode string.
2053      * @stable ICU 2.6
2054      */
reverse(StringBuffer source)2055     public static StringBuffer reverse(StringBuffer source) {
2056         int length = source.length();
2057         StringBuffer result = new StringBuffer(length);
2058         for (int i = length; i-- > 0;) {
2059             char ch = source.charAt(i);
2060             if (isTrailSurrogate(ch) && i > 0) {
2061                 char ch2 = source.charAt(i - 1);
2062                 if (isLeadSurrogate(ch2)) {
2063                     result.append(ch2);
2064                     result.append(ch);
2065                     --i;
2066                     continue;
2067                 }
2068             }
2069             result.append(ch);
2070         }
2071         return result;
2072     }
2073 
2074     /**
2075      * Check if the string contains more Unicode code points than a certain number. This is more
2076      * efficient than counting all code points in the entire string and comparing that number with a
2077      * threshold. This function may not need to scan the string at all if the length is within a
2078      * certain range, and never needs to count more than 'number + 1' code points. Logically
2079      * equivalent to (countCodePoint(s) &gt; number). A Unicode code point may occupy either one or two
2080      * code units.
2081      *
2082      * @param source The input string.
2083      * @param number The number of code points in the string is compared against the 'number'
2084      *            parameter.
2085      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2086      * @stable ICU 2.4
2087      */
hasMoreCodePointsThan(String source, int number)2088     public static boolean hasMoreCodePointsThan(String source, int number) {
2089         if (number < 0) {
2090             return true;
2091         }
2092         if (source == null) {
2093             return false;
2094         }
2095         int length = source.length();
2096 
2097         // length >= 0 known
2098         // source contains at least (length + 1) / 2 code points: <= 2
2099         // chars per cp
2100         if (((length + 1) >> 1) > number) {
2101             return true;
2102         }
2103 
2104         // check if source does not even contain enough chars
2105         int maxsupplementary = length - number;
2106         if (maxsupplementary <= 0) {
2107             return false;
2108         }
2109 
2110         // there are maxsupplementary = length - number more chars than
2111         // asked-for code points
2112 
2113         // count code points until they exceed and also check that there are
2114         // no more than maxsupplementary supplementary code points (char pairs)
2115         int start = 0;
2116         while (true) {
2117             if (length == 0) {
2118                 return false;
2119             }
2120             if (number == 0) {
2121                 return true;
2122             }
2123             if (isLeadSurrogate(source.charAt(start++)) && start != length
2124                     && isTrailSurrogate(source.charAt(start))) {
2125                 start++;
2126                 if (--maxsupplementary <= 0) {
2127                     // too many pairs - too few code points
2128                     return false;
2129                 }
2130             }
2131             --number;
2132         }
2133     }
2134 
2135     /**
2136      * Check if the sub-range of char array, from argument start to limit, contains more Unicode
2137      * code points than a certain number. This is more efficient than counting all code points in
2138      * the entire char array range and comparing that number with a threshold. This function may not
2139      * need to scan the char array at all if start and limit is within a certain range, and never
2140      * needs to count more than 'number + 1' code points. Logically equivalent to
2141      * (countCodePoint(source, start, limit) &gt; number). A Unicode code point may occupy either one
2142      * or two code units.
2143      *
2144      * @param source Array of UTF-16 chars
2145      * @param start Offset to substring in the source array for analyzing
2146      * @param limit Offset to substring in the source array for analyzing
2147      * @param number The number of code points in the string is compared against the 'number'
2148      *            parameter.
2149      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2150      * @exception IndexOutOfBoundsException Thrown when limit &lt; start
2151      * @stable ICU 2.4
2152      */
hasMoreCodePointsThan(char source[], int start, int limit, int number)2153     public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
2154         int length = limit - start;
2155         if (length < 0 || start < 0 || limit < 0) {
2156             throw new IndexOutOfBoundsException(
2157                     "Start and limit indexes should be non-negative and start <= limit");
2158         }
2159         if (number < 0) {
2160             return true;
2161         }
2162         if (source == null) {
2163             return false;
2164         }
2165 
2166         // length >= 0 known
2167         // source contains at least (length + 1) / 2 code points: <= 2
2168         // chars per cp
2169         if (((length + 1) >> 1) > number) {
2170             return true;
2171         }
2172 
2173         // check if source does not even contain enough chars
2174         int maxsupplementary = length - number;
2175         if (maxsupplementary <= 0) {
2176             return false;
2177         }
2178 
2179         // there are maxsupplementary = length - number more chars than
2180         // asked-for code points
2181 
2182         // count code points until they exceed and also check that there are
2183         // no more than maxsupplementary supplementary code points (char pairs)
2184         while (true) {
2185             if (length == 0) {
2186                 return false;
2187             }
2188             if (number == 0) {
2189                 return true;
2190             }
2191             if (isLeadSurrogate(source[start++]) && start != limit
2192                     && isTrailSurrogate(source[start])) {
2193                 start++;
2194                 if (--maxsupplementary <= 0) {
2195                     // too many pairs - too few code points
2196                     return false;
2197                 }
2198             }
2199             --number;
2200         }
2201     }
2202 
2203     /**
2204      * Check if the string buffer contains more Unicode code points than a certain number. This is
2205      * more efficient than counting all code points in the entire string buffer and comparing that
2206      * number with a threshold. This function may not need to scan the string buffer at all if the
2207      * length is within a certain range, and never needs to count more than 'number + 1' code
2208      * points. Logically equivalent to (countCodePoint(s) &gt; number). A Unicode code point may
2209      * occupy either one or two code units.
2210      *
2211      * @param source The input string buffer.
2212      * @param number The number of code points in the string buffer is compared against the 'number'
2213      *            parameter.
2214      * @return boolean value for whether the string buffer contains more Unicode code points than
2215      *         'number'.
2216      * @stable ICU 2.4
2217      */
hasMoreCodePointsThan(StringBuffer source, int number)2218     public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
2219         if (number < 0) {
2220             return true;
2221         }
2222         if (source == null) {
2223             return false;
2224         }
2225         int length = source.length();
2226 
2227         // length >= 0 known
2228         // source contains at least (length + 1) / 2 code points: <= 2
2229         // chars per cp
2230         if (((length + 1) >> 1) > number) {
2231             return true;
2232         }
2233 
2234         // check if source does not even contain enough chars
2235         int maxsupplementary = length - number;
2236         if (maxsupplementary <= 0) {
2237             return false;
2238         }
2239 
2240         // there are maxsupplementary = length - number more chars than
2241         // asked-for code points
2242 
2243         // count code points until they exceed and also check that there are
2244         // no more than maxsupplementary supplementary code points (char pairs)
2245         int start = 0;
2246         while (true) {
2247             if (length == 0) {
2248                 return false;
2249             }
2250             if (number == 0) {
2251                 return true;
2252             }
2253             if (isLeadSurrogate(source.charAt(start++)) && start != length
2254                     && isTrailSurrogate(source.charAt(start))) {
2255                 start++;
2256                 if (--maxsupplementary <= 0) {
2257                     // too many pairs - too few code points
2258                     return false;
2259                 }
2260             }
2261             --number;
2262         }
2263     }
2264 
2265     /**
2266      * Cover JDK 1.5 API. Create a String from an array of codePoints.
2267      *
2268      * @param codePoints The code array
2269      * @param offset The start of the text in the code point array
2270      * @param count The number of code points
2271      * @return a String representing the code points between offset and count
2272      * @throws IllegalArgumentException If an invalid code point is encountered
2273      * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
2274      * @stable ICU 3.0
2275      */
newString(int[] codePoints, int offset, int count)2276     public static String newString(int[] codePoints, int offset, int count) {
2277         if (count < 0) {
2278             throw new IllegalArgumentException();
2279         }
2280         char[] chars = new char[count];
2281         int w = 0;
2282         for (int r = offset, e = offset + count; r < e; ++r) {
2283             int cp = codePoints[r];
2284             if (cp < 0 || cp > 0x10ffff) {
2285                 throw new IllegalArgumentException();
2286             }
2287             while (true) {
2288                 try {
2289                     if (cp < 0x010000) {
2290                         chars[w] = (char) cp;
2291                         w++;
2292                     } else {
2293                         chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
2294                         chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
2295                         w += 2;
2296                     }
2297                     break;
2298                 } catch (IndexOutOfBoundsException ex) {
2299                     int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
2300                             / (r - offset + 1)));
2301                     char[] temp = new char[newlen];
2302                     System.arraycopy(chars, 0, temp, 0, w);
2303                     chars = temp;
2304                 }
2305             }
2306         }
2307         return new String(chars, 0, w);
2308     }
2309 
2310     /**
2311      * <p>
2312      * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
2313      * modes
2314      * </p>
2315      * <ul>
2316      * <li> Code point comparison or code unit comparison
2317      * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
2318      * with special handling for character 'i'.
2319      * </ul>
2320      * <p>
2321      * The code unit or code point comparison differ only when comparing supplementary code points
2322      * (&#92;u10000..&#92;u10ffff) to BMP code points near the end of the BMP (i.e.,
2323      * &#92;ue000..&#92;uffff). In code unit comparison, high BMP code points sort after
2324      * supplementary code points because they are stored as pairs of surrogates which are at
2325      * &#92;ud800..&#92;udfff.
2326      * </p>
2327      *
2328      * @see #FOLD_CASE_DEFAULT
2329      * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2330      * @stable ICU 2.1
2331      */
2332     public static final class StringComparator implements java.util.Comparator<String> {
2333         // public constructor ------------------------------------------------
2334 
2335         /**
2336          * Default constructor that does code unit comparison and case sensitive comparison.
2337          *
2338          * @stable ICU 2.1
2339          */
StringComparator()2340         public StringComparator() {
2341             this(false, false, FOLD_CASE_DEFAULT);
2342         }
2343 
2344         /**
2345          * Constructor that does comparison based on the argument options.
2346          *
2347          * @param codepointcompare Flag to indicate true for code point comparison or false for code unit
2348          *            comparison.
2349          * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison
2350          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2351          *            when ignorecase is set to true. If ignorecase is false, this option is
2352          *            ignored.
2353          * @see #FOLD_CASE_DEFAULT
2354          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2355          * @throws IllegalArgumentException If foldcaseoption is out of range
2356          * @stable ICU 2.4
2357          */
StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption)2358         public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
2359             setCodePointCompare(codepointcompare);
2360             m_ignoreCase_ = ignorecase;
2361             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2362                 throw new IllegalArgumentException("Invalid fold case option");
2363             }
2364             m_foldCase_ = foldcaseoption;
2365         }
2366 
2367         // public data member ------------------------------------------------
2368 
2369         /**
2370          * Option value for case folding comparison:
2371          *
2372          * <p>Comparison is case insensitive, strings are folded using default mappings defined in
2373          * Unicode data file CaseFolding.txt, before comparison.
2374          *
2375          * @stable ICU 2.4
2376          */
2377         public static final int FOLD_CASE_DEFAULT = 0;
2378 
2379         /**
2380          * Option value for case folding:
2381          * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
2382          * and dotless i appropriately for Turkic languages (tr, az).
2383          *
2384          * <p>Comparison is case insensitive, strings are folded using modified mappings defined in
2385          * Unicode data file CaseFolding.txt, before comparison.
2386          *
2387          * @stable ICU 2.4
2388          * @see com.ibm.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
2389          */
2390         public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2391 
2392         // public methods ----------------------------------------------------
2393 
2394         // public setters ----------------------------------------------------
2395 
2396         /**
2397          * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
2398          * is set to code unit compare
2399          *
2400          * @param flag True for code point compare, false for code unit compare
2401          * @stable ICU 2.4
2402          */
setCodePointCompare(boolean flag)2403         public void setCodePointCompare(boolean flag) {
2404             if (flag) {
2405                 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2406             } else {
2407                 m_codePointCompare_ = 0;
2408             }
2409         }
2410 
2411         /**
2412          * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
2413          * case sensitive comparison mode if set to false.
2414          *
2415          * @param ignorecase True for case-insitive comparison, false for case sensitive comparison
2416          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2417          *            when ignorecase is set to true. If ignorecase is false, this option is
2418          *            ignored.
2419          * @see #FOLD_CASE_DEFAULT
2420          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2421          * @stable ICU 2.4
2422          */
setIgnoreCase(boolean ignorecase, int foldcaseoption)2423         public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
2424             m_ignoreCase_ = ignorecase;
2425             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2426                 throw new IllegalArgumentException("Invalid fold case option");
2427             }
2428             m_foldCase_ = foldcaseoption;
2429         }
2430 
2431         // public getters ----------------------------------------------------
2432 
2433         /**
2434          * Checks if the comparison mode is code point compare.
2435          *
2436          * @return true for code point compare, false for code unit compare
2437          * @stable ICU 2.4
2438          */
getCodePointCompare()2439         public boolean getCodePointCompare() {
2440             return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2441         }
2442 
2443         /**
2444          * Checks if Comparator is in the case insensitive mode.
2445          *
2446          * @return true if Comparator performs case insensitive comparison, false otherwise
2447          * @stable ICU 2.4
2448          */
getIgnoreCase()2449         public boolean getIgnoreCase() {
2450             return m_ignoreCase_;
2451         }
2452 
2453         /**
2454          * Gets the fold case options set in Comparator to be used with case insensitive comparison.
2455          *
2456          * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2457          * @see #FOLD_CASE_DEFAULT
2458          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2459          * @stable ICU 2.4
2460          */
getIgnoreCaseOption()2461         public int getIgnoreCaseOption() {
2462             return m_foldCase_;
2463         }
2464 
2465         // public other methods ----------------------------------------------
2466 
2467         /**
2468          * Compare two strings depending on the options selected during construction.
2469          *
2470          * @param a first source string.
2471          * @param b second source string.
2472          * @return 0 returned if a == b. If a &lt; b, a negative value is returned. Otherwise if a &gt; b,
2473          *         a positive value is returned.
2474          * @exception ClassCastException thrown when either a or b is not a String object
2475          * @stable ICU 4.4
2476          */
compare(String a, String b)2477         public int compare(String a, String b) {
2478             if (a == b) {
2479                 return 0;
2480             }
2481             if (a == null) {
2482                 return -1;
2483             }
2484             if (b == null) {
2485                 return 1;
2486             }
2487 
2488             if (m_ignoreCase_) {
2489                 return compareCaseInsensitive(a, b);
2490             }
2491             return compareCaseSensitive(a, b);
2492         }
2493 
2494         // private data member ----------------------------------------------
2495 
2496         /**
2497          * Code unit comparison flag. True if code unit comparison is required. False if code point
2498          * comparison is required.
2499          */
2500         private int m_codePointCompare_;
2501 
2502         /**
2503          * Fold case comparison option.
2504          */
2505         private int m_foldCase_;
2506 
2507         /**
2508          * Flag indicator if ignore case is to be used during comparison
2509          */
2510         private boolean m_ignoreCase_;
2511 
2512         /**
2513          * Code point order offset for surrogate characters
2514          */
2515         private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2516 
2517         // private method ---------------------------------------------------
2518 
2519         /**
2520          * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
2521          * easier.
2522          *
2523          * @param s1
2524          *            first string to compare
2525          * @param s2
2526          *            second string to compare
2527          * @return -1 is s1 &lt; s2, 0 if equals,
2528          */
compareCaseInsensitive(String s1, String s2)2529         private int compareCaseInsensitive(String s1, String s2) {
2530             return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
2531                     | Normalizer.COMPARE_IGNORE_CASE);
2532         }
2533 
2534         /**
2535          * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
2536          * easier.
2537          *
2538          * @param s1
2539          *            first string to compare
2540          * @param s2
2541          *            second string to compare
2542          * @return -1 is s1 &lt; s2, 0 if equals,
2543          */
compareCaseSensitive(String s1, String s2)2544         private int compareCaseSensitive(String s1, String s2) {
2545             // compare identical prefixes - they do not need to be fixed up
2546             // limit1 = start1 + min(lenght1, length2)
2547             int length1 = s1.length();
2548             int length2 = s2.length();
2549             int minlength = length1;
2550             int result = 0;
2551             if (length1 < length2) {
2552                 result = -1;
2553             } else if (length1 > length2) {
2554                 result = 1;
2555                 minlength = length2;
2556             }
2557 
2558             char c1 = 0;
2559             char c2 = 0;
2560             int index = 0;
2561             for (; index < minlength; index++) {
2562                 c1 = s1.charAt(index);
2563                 c2 = s2.charAt(index);
2564                 // check pseudo-limit
2565                 if (c1 != c2) {
2566                     break;
2567                 }
2568             }
2569 
2570             if (index == minlength) {
2571                 return result;
2572             }
2573 
2574             boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2575             // if both values are in or above the surrogate range, fix them up
2576             if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
2577                     && codepointcompare) {
2578                 // subtract 0x2800 from BMP code points to make them smaller
2579                 // than supplementary ones
2580                 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
2581                         || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
2582                     // part of a surrogate pair, leave >=d800
2583                 } else {
2584                     // BMP code point - may be surrogate code point - make
2585                     // < d800
2586                     c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2587                 }
2588 
2589                 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
2590                         || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
2591                     // part of a surrogate pair, leave >=d800
2592                 } else {
2593                     // BMP code point - may be surrogate code point - make <d800
2594                     c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2595                 }
2596             }
2597 
2598             // now c1 and c2 are in UTF-32-compatible order
2599             return c1 - c2;
2600         }
2601     }
2602 
2603     /**
2604      * Utility for getting a code point from a CharSequence that contains exactly one code point.
2605      * @return the code point IF the string is non-null and consists of a single code point.
2606      * otherwise returns -1.
2607      * @param s to test
2608      * @stable ICU 54
2609      */
getSingleCodePoint(CharSequence s)2610     public static int getSingleCodePoint(CharSequence s) {
2611         if (s == null || s.length() == 0) {
2612             return -1;
2613         } else if (s.length() == 1) {
2614             return s.charAt(0);
2615         } else if (s.length() > 2) {
2616             return -1;
2617         }
2618 
2619         // at this point, len = 2
2620         int cp = Character.codePointAt(s, 0);
2621         if (cp > 0xFFFF) { // is surrogate pair
2622             return cp;
2623         }
2624         return -1;
2625     }
2626 
2627     /**
2628      * Utility for comparing a code point to a string without having to create a new string. Returns the same results
2629      * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if
2630      * <pre>
2631      * sc = new StringComparator(true,false,0);
2632      * fast = UTF16.compareCodePoint(codePoint, charSequence)
2633      * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
2634      * </pre>
2635      * then
2636      * <pre>
2637      * Integer.signum(fast) == Integer.signum(slower)
2638      * </pre>
2639      * @param codePoint to test
2640      * @param s to test
2641      * @return equivalent of code point comparator comparing two strings.
2642      * @stable ICU 54
2643      */
compareCodePoint(int codePoint, CharSequence s)2644     public static int compareCodePoint(int codePoint, CharSequence s) {
2645         if (s == null) {
2646             return 1;
2647         }
2648         final int strLen = s.length();
2649         if (strLen == 0) {
2650             return 1;
2651         }
2652         int second = Character.codePointAt(s, 0);
2653         int diff = codePoint - second;
2654         if (diff != 0) {
2655             return diff;
2656         }
2657         return strLen == Character.charCount(codePoint) ? 0 : -1;
2658     }
2659 
2660     // private data members -------------------------------------------------
2661 
2662     /**
2663      * Number of bits a supplementary code point is shifted right by to obtain its lead surrogate part.
2664      */
2665     private static final int LEAD_SURROGATE_SHIFT_ = 10;
2666 
2667     /**
2668      * Mask selecting the low 10 bits of a code point, which are added to TRAIL_SURROGATE_MIN_VALUE to form its trail surrogate.
2669      */
2670     private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
2671 
2672     /**
2673      * Constant added to (code point &gt;&gt; LEAD_SURROGATE_SHIFT_) to form the lead surrogate: LEAD_SURROGATE_MIN_VALUE minus the shifted SUPPLEMENTARY_MIN_VALUE.
2674      */
2675     private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
2676             - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
2677 
2678     // private methods ------------------------------------------------------
2679 
2680     /**
2681      * <p>
2682      * Converts argument code point and returns a String object representing the code point's value
2683      * in UTF16 format.
2684      * </p>
2685      * <p>
2686      * This method does not check for the validity of the codepoint, the results are not guaranteed
2687      * if a invalid codepoint is passed as argument.
2688      * </p>
2689      * <p>
2690      * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
2691      * </p>
2692      *
2693      * @param ch
2694      *            code point
2695      * @return string representation of the code point
2696      */
toString(int ch)2697     private static String toString(int ch) {
2698         if (ch < SUPPLEMENTARY_MIN_VALUE) {
2699             return String.valueOf((char) ch);
2700         }
2701 
2702         StringBuilder result = new StringBuilder();
2703         result.append(getLeadSurrogate(ch));
2704         result.append(getTrailSurrogate(ch));
2705         return result.toString();
2706     }
2707 }
2708 // eof
2709