• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 1996-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.text;
11 
12 import com.ibm.icu.impl.Utility;
13 
/**
 * <p>
 * Standalone utility class providing UTF16 character conversions and indexing conversions.
 * </p>
 * <p>
 * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
 * so searching for strings is a safe operation. Similarly, concatenation is always safe.
 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
 * values for start and end are on those boundaries, since they arose from operations like
 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
 * </p>
 * <strong>Examples:</strong>
 * <p>
 * The following examples illustrate use of some of these methods.
 *
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i &lt; s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i &gt;= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * int ch;
 * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 *
 * <strong>Notes:</strong>
 * <ul>
 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
 * </li>
 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
 * check for validity if desired. </li>
 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
 * 5.5). </li>
 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
 * percentage of all the text in the world, the singleton case should always be optimized for. </li>
 * </ul>
 *
 * @author Mark Davis, with help from Markus Scherer
 * @stable ICU 2.1
 */
84 
85 public final class UTF16 {
86     // public variables ---------------------------------------------------
87 
88     /**
89      * Value returned in {@link #bounds(String, int) bounds()}.
90      * These values are chosen specifically so that it actually represents the position of the
91      * character [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)]
92      *
93      * @stable ICU 2.1
94      */
95     public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
96             TRAIL_SURROGATE_BOUNDARY = 5;
97 
98     /**
99      * The lowest Unicode code point value.
100      *
101      * @stable ICU 2.1
102      */
103     public static final int CODEPOINT_MIN_VALUE = 0;
104 
105     /**
106      * The highest Unicode code point value (scalar value) according to the Unicode Standard.
107      *
108      * @stable ICU 2.1
109      */
110     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
111 
112     /**
113      * The minimum value for Supplementary code points
114      *
115      * @stable ICU 2.1
116      */
117     public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
118 
119     /**
120      * Lead surrogate minimum value
121      *
122      * @stable ICU 2.1
123      */
124     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
125 
126     /**
127      * Trail surrogate minimum value
128      *
129      * @stable ICU 2.1
130      */
131     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
132 
133     /**
134      * Lead surrogate maximum value
135      *
136      * @stable ICU 2.1
137      */
138     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
139 
140     /**
141      * Trail surrogate maximum value
142      *
143      * @stable ICU 2.1
144      */
145     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
146 
147     /**
148      * Surrogate minimum value
149      *
150      * @stable ICU 2.1
151      */
152     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
153 
154     /**
155      * Maximum surrogate value
156      *
157      * @stable ICU 2.1
158      */
159     public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
160 
161     /**
162      * Lead surrogate bitmask
163      */
164     private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
165 
166     /**
167      * Trail surrogate bitmask
168      */
169     private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
170 
171     /**
172      * Surrogate bitmask
173      */
174     private static final int SURROGATE_BITMASK = 0xFFFFF800;
175 
176     /**
177      * Lead surrogate bits
178      */
179     private static final int LEAD_SURROGATE_BITS = 0xD800;
180 
181     /**
182      * Trail surrogate bits
183      */
184     private static final int TRAIL_SURROGATE_BITS = 0xDC00;
185 
186     /**
187      * Surrogate bits
188      */
189     private static final int SURROGATE_BITS = 0xD800;
190 
191     // constructor --------------------------------------------------------
192 
193     // /CLOVER:OFF
194     /**
195      * Prevent instance from being created.
196      */
UTF16()197     private UTF16() {
198     }
199 
200     // /CLOVER:ON
201     // public method ------------------------------------------------------
202 
203     /**
204      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
205      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
206      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
207      * UCharacter.isLegal()</a></code>
208      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
209      * character will be returned. If a complete supplementary character is not found the incomplete
210      * character will be returned
211      *
212      * @param source Array of UTF-16 chars
213      * @param offset16 UTF-16 offset to the start of the character.
214      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
215      *         of that codepoint are the same as in <code>bounds32()</code>.
216      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
217      * @stable ICU 2.1
218      */
charAt(String source, int offset16)219     public static int charAt(String source, int offset16) {
220         char single = source.charAt(offset16);
221         if (single < LEAD_SURROGATE_MIN_VALUE) {
222             return single;
223         }
224         return _charAt(source, offset16, single);
225     }
226 
_charAt(String source, int offset16, char single)227     private static int _charAt(String source, int offset16, char single) {
228         if (single > TRAIL_SURROGATE_MAX_VALUE) {
229             return single;
230         }
231 
232         // Convert the UTF-16 surrogate pair if necessary.
233         // For simplicity in usage, and because the frequency of pairs is
234         // low, look both directions.
235 
236         if (single <= LEAD_SURROGATE_MAX_VALUE) {
237             ++offset16;
238             if (source.length() != offset16) {
239                 char trail = source.charAt(offset16);
240                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
241                     return Character.toCodePoint(single, trail);
242                 }
243             }
244         } else {
245             --offset16;
246             if (offset16 >= 0) {
247                 // single is a trail surrogate so
248                 char lead = source.charAt(offset16);
249                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
250                     return Character.toCodePoint(lead, single);
251                 }
252             }
253         }
254         return single; // return unmatched surrogate
255     }
256 
257     /**
258      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
259      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
260      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
261      * UCharacter.isLegal()</a></code>
262      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
263      * character will be returned. If a complete supplementary character is not found the incomplete
264      * character will be returned
265      *
266      * @param source Array of UTF-16 chars
267      * @param offset16 UTF-16 offset to the start of the character.
268      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
269      *         of that codepoint are the same as in <code>bounds32()</code>.
270      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
271      * @stable ICU 2.1
272      */
charAt(CharSequence source, int offset16)273     public static int charAt(CharSequence source, int offset16) {
274         char single = source.charAt(offset16);
275         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
276             return single;
277         }
278         return _charAt(source, offset16, single);
279     }
280 
_charAt(CharSequence source, int offset16, char single)281     private static int _charAt(CharSequence source, int offset16, char single) {
282         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
283             return single;
284         }
285 
286         // Convert the UTF-16 surrogate pair if necessary.
287         // For simplicity in usage, and because the frequency of pairs is
288         // low, look both directions.
289 
290         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
291             ++offset16;
292             if (source.length() != offset16) {
293                 char trail = source.charAt(offset16);
294                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
295                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
296                     return Character.toCodePoint(single, trail);
297                 }
298             }
299         } else {
300             --offset16;
301             if (offset16 >= 0) {
302                 // single is a trail surrogate so
303                 char lead = source.charAt(offset16);
304                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
305                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
306                     return Character.toCodePoint(lead, single);
307                 }
308             }
309         }
310         return single; // return unmatched surrogate
311     }
312 
313     /**
314      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
315      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
316      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
317      * </a></code>
318      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
319      * character will be returned. If a complete supplementary character is not found the incomplete
320      * character will be returned
321      *
322      * @param source UTF-16 chars string buffer
323      * @param offset16 UTF-16 offset to the start of the character.
324      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
325      *         of that codepoint are the same as in <code>bounds32()</code>.
326      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
327      * @stable ICU 2.1
328      */
charAt(StringBuffer source, int offset16)329     public static int charAt(StringBuffer source, int offset16) {
330         if (offset16 < 0 || offset16 >= source.length()) {
331             throw new StringIndexOutOfBoundsException(offset16);
332         }
333 
334         char single = source.charAt(offset16);
335         if (!isSurrogate(single)) {
336             return single;
337         }
338 
339         // Convert the UTF-16 surrogate pair if necessary.
340         // For simplicity in usage, and because the frequency of pairs is
341         // low, look both directions.
342 
343         if (single <= LEAD_SURROGATE_MAX_VALUE) {
344             ++offset16;
345             if (source.length() != offset16) {
346                 char trail = source.charAt(offset16);
347                 if (isTrailSurrogate(trail))
348                     return Character.toCodePoint(single, trail);
349             }
350         } else {
351             --offset16;
352             if (offset16 >= 0) {
353                 // single is a trail surrogate so
354                 char lead = source.charAt(offset16);
355                 if (isLeadSurrogate(lead)) {
356                     return Character.toCodePoint(lead, single);
357                 }
358             }
359         }
360         return single; // return unmatched surrogate
361     }
362 
363     /**
364      * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
365      * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
366      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
367      * </a></code>
368      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
369      * character will be returned. If a complete supplementary character is not found the incomplete
370      * character will be returned
371      *
372      * @param source Array of UTF-16 chars
373      * @param start Offset to substring in the source array for analyzing
374      * @param limit Offset to substring in the source array for analyzing
375      * @param offset16 UTF-16 offset relative to start
376      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
377      *         of that codepoint are the same as in <code>bounds32()</code>.
378      * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
379      * @stable ICU 2.1
380      */
charAt(char source[], int start, int limit, int offset16)381     public static int charAt(char source[], int start, int limit, int offset16) {
382         offset16 += start;
383         if (offset16 < start || offset16 >= limit) {
384             throw new ArrayIndexOutOfBoundsException(offset16);
385         }
386 
387         char single = source[offset16];
388         if (!isSurrogate(single)) {
389             return single;
390         }
391 
392         // Convert the UTF-16 surrogate pair if necessary.
393         // For simplicity in usage, and because the frequency of pairs is
394         // low, look both directions.
395         if (single <= LEAD_SURROGATE_MAX_VALUE) {
396             offset16++;
397             if (offset16 >= limit) {
398                 return single;
399             }
400             char trail = source[offset16];
401             if (isTrailSurrogate(trail)) {
402                 return Character.toCodePoint(single, trail);
403             }
404         } else { // isTrailSurrogate(single), so
405             if (offset16 == start) {
406                 return single;
407             }
408             offset16--;
409             char lead = source[offset16];
410             if (isLeadSurrogate(lead))
411                 return Character.toCodePoint(lead, single);
412         }
413         return single; // return unmatched surrogate
414     }
415 
416     /**
417      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
418      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
419      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
420      * </a></code>
421      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
422      * character will be returned. If a complete supplementary character is not found the incomplete
423      * character will be returned
424      *
425      * @param source UTF-16 chars string buffer
426      * @param offset16 UTF-16 offset to the start of the character.
427      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
428      *         of that codepoint are the same as in <code>bounds32()</code>.
429      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
430      * @stable ICU 2.1
431      */
charAt(Replaceable source, int offset16)432     public static int charAt(Replaceable source, int offset16) {
433         if (offset16 < 0 || offset16 >= source.length()) {
434             throw new StringIndexOutOfBoundsException(offset16);
435         }
436 
437         char single = source.charAt(offset16);
438         if (!isSurrogate(single)) {
439             return single;
440         }
441 
442         // Convert the UTF-16 surrogate pair if necessary.
443         // For simplicity in usage, and because the frequency of pairs is
444         // low, look both directions.
445 
446         if (single <= LEAD_SURROGATE_MAX_VALUE) {
447             ++offset16;
448             if (source.length() != offset16) {
449                 char trail = source.charAt(offset16);
450                 if (isTrailSurrogate(trail))
451                     return Character.toCodePoint(single, trail);
452             }
453         } else {
454             --offset16;
455             if (offset16 >= 0) {
456                 // single is a trail surrogate so
457                 char lead = source.charAt(offset16);
458                 if (isLeadSurrogate(lead)) {
459                     return Character.toCodePoint(lead, single);
460                 }
461             }
462         }
463         return single; // return unmatched surrogate
464     }
465 
466     /**
467      * Determines how many chars this char32 requires. If a validity check is required, use <code>
468      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
469      * on char32 before calling.
470      *
471      * @param char32 The input codepoint.
472      * @return 2 if is in supplementary space, otherwise 1.
473      * @stable ICU 2.1
474      */
getCharCount(int char32)475     public static int getCharCount(int char32) {
476         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
477             return 1;
478         }
479         return 2;
480     }
481 
482     /**
483      * Returns the type of the boundaries around the char at offset16. Used for random access.
484      *
485      * @param source Text to analyse
486      * @param offset16 UTF-16 offset
487      * @return
488      *            <ul>
489      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
490      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
491      *            are [offset16, offset16 + 2]
492      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
493      *            bounds are [offset16 - 1, offset16 + 1]
494      *            </ul>
495      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
496      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
497      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
498      * @stable ICU 2.1
499      */
bounds(String source, int offset16)500     public static int bounds(String source, int offset16) {
501         char ch = source.charAt(offset16);
502         if (isSurrogate(ch)) {
503             if (isLeadSurrogate(ch)) {
504                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
505                     return LEAD_SURROGATE_BOUNDARY;
506                 }
507             } else {
508                 // isTrailSurrogate(ch), so
509                 --offset16;
510                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
511                     return TRAIL_SURROGATE_BOUNDARY;
512                 }
513             }
514         }
515         return SINGLE_CHAR_BOUNDARY;
516     }
517 
518     /**
519      * Returns the type of the boundaries around the char at offset16. Used for random access.
520      *
521      * @param source String buffer to analyse
522      * @param offset16 UTF16 offset
523      * @return
524      *            <ul>
525      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
526      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
527      *            are [offset16, offset16 + 2]
528      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
529      *            bounds are [offset16 - 1, offset16 + 1]
530      *            </ul>
531      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
532      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
533      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
534      * @stable ICU 2.1
535      */
bounds(StringBuffer source, int offset16)536     public static int bounds(StringBuffer source, int offset16) {
537         char ch = source.charAt(offset16);
538         if (isSurrogate(ch)) {
539             if (isLeadSurrogate(ch)) {
540                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
541                     return LEAD_SURROGATE_BOUNDARY;
542                 }
543             } else {
544                 // isTrailSurrogate(ch), so
545                 --offset16;
546                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
547                     return TRAIL_SURROGATE_BOUNDARY;
548                 }
549             }
550         }
551         return SINGLE_CHAR_BOUNDARY;
552     }
553 
554     /**
555      * Returns the type of the boundaries around the char at offset16. Used for random access. Note
556      * that the boundaries are determined with respect to the subarray, hence the char array
557      * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
558      *
559      * @param source Char array to analyse
560      * @param start Offset to substring in the source array for analyzing
561      * @param limit Offset to substring in the source array for analyzing
562      * @param offset16 UTF16 offset relative to start
563      * @return
564      *            <ul>
565      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
566      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
567      *            are [offset16, offset16 + 2]
568      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
569      *            bounds are [offset16 - 1, offset16 + 1]
570      *            </ul>
571      *            For bit-twiddlers, the boundary values for these are chosen so that the boundaries
572      *            can be gotten by: [offset16 - (boundvalue &gt;&gt; 2), offset16 + (boundvalue &amp; 3)].
573      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
574      * @stable ICU 2.1
575      */
bounds(char source[], int start, int limit, int offset16)576     public static int bounds(char source[], int start, int limit, int offset16) {
577         offset16 += start;
578         if (offset16 < start || offset16 >= limit) {
579             throw new ArrayIndexOutOfBoundsException(offset16);
580         }
581         char ch = source[offset16];
582         if (isSurrogate(ch)) {
583             if (isLeadSurrogate(ch)) {
584                 ++offset16;
585                 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
586                     return LEAD_SURROGATE_BOUNDARY;
587                 }
588             } else { // isTrailSurrogate(ch), so
589                 --offset16;
590                 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
591                     return TRAIL_SURROGATE_BOUNDARY;
592                 }
593             }
594         }
595         return SINGLE_CHAR_BOUNDARY;
596     }
597 
598     /**
599      * Determines whether the code point is a surrogate.
600      *
601      * @param codePoint The input character.
602      *        (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
603      * @return true If the input code point is a surrogate.
604      * @stable ICU 70
605      */
isSurrogate(int codePoint)606     public static boolean isSurrogate(int codePoint) {
607         return (codePoint & SURROGATE_BITMASK) == SURROGATE_BITS;
608     }
609 
610     /**
611      * Determines whether the code point is a trail surrogate.
612      *
613      * @param codePoint The input character.
614      *        (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
615      * @return true If the input code point is a trail surrogate.
616      * @stable ICU 70
617      */
isTrailSurrogate(int codePoint)618     public static boolean isTrailSurrogate(int codePoint) {
619         return (codePoint & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
620     }
621 
622     /**
623      * Determines whether the code point is a lead surrogate.
624      *
625      * @param codePoint The input character.
626      *        (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
627      * @return true If the input code point is a lead surrogate
628      * @stable ICU 70
629      */
isLeadSurrogate(int codePoint)630     public static boolean isLeadSurrogate(int codePoint) {
631         return (codePoint & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
632     }
633 
634     /**
635      * Returns the lead surrogate. If a validity check is required, use
636      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
637      * before calling.
638      *
639      * @param char32 The input character.
640      * @return lead surrogate if the getCharCount(ch) is 2; <br>
641      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
642      * @stable ICU 2.1
643      */
getLeadSurrogate(int char32)644     public static char getLeadSurrogate(int char32) {
645         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
646             return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
647         }
648         return 0;
649     }
650 
651     /**
652      * Returns the trail surrogate. If a validity check is required, use
653      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
654      * before calling.
655      *
656      * @param char32 The input character.
657      * @return the trail surrogate if the getCharCount(ch) is 2; <br>
658      *         otherwise the character itself
659      * @stable ICU 2.1
660      */
getTrailSurrogate(int char32)661     public static char getTrailSurrogate(int char32) {
662         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
663             return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
664         }
665         return (char) char32;
666     }
667 
668     /**
669      * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
670      * containing the UTF-32 value in UTF16 format. If a validity check is required, use
671      * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before calling.
672      *
673      * @param char32 The input character.
674      * @return string value of char32 in UTF16 format
675      * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint.
676      * @stable ICU 2.1
677      */
valueOf(int char32)678     public static String valueOf(int char32) {
679         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
680             throw new IllegalArgumentException("Illegal codepoint");
681         }
682         return toString(char32);
683     }
684 
685     /**
686      * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
687      * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
688      * character, the whole supplementary codepoint will be returned. If a validity check is
689      * required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the
690      * codepoint at offset16 before calling. The result returned will be a newly created String
691      * obtained by calling source.substring(..) with the appropriate indexes.
692      *
693      * @param source The input string.
694      * @param offset16 The UTF16 index to the codepoint in source
695      * @return string value of char32 in UTF16 format
696      * @stable ICU 2.1
697      */
valueOf(String source, int offset16)698     public static String valueOf(String source, int offset16) {
699         switch (bounds(source, offset16)) {
700         case LEAD_SURROGATE_BOUNDARY:
701             return source.substring(offset16, offset16 + 2);
702         case TRAIL_SURROGATE_BOUNDARY:
703             return source.substring(offset16 - 1, offset16 + 1);
704         default:
705             return source.substring(offset16, offset16 + 1);
706         }
707     }
708 
709     /**
710      * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
711      * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
712      * surrogate character, the whole supplementary codepoint will be returned. If a validity check
713      * is required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on
714      * the codepoint at offset16 before calling. The result returned will be a newly created String
715      * obtained by calling source.substring(..) with the appropriate indexes.
716      *
717      * @param source The input string buffer.
718      * @param offset16 The UTF16 index to the codepoint in source
719      * @return string value of char32 in UTF16 format
720      * @stable ICU 2.1
721      */
valueOf(StringBuffer source, int offset16)722     public static String valueOf(StringBuffer source, int offset16) {
723         switch (bounds(source, offset16)) {
724         case LEAD_SURROGATE_BOUNDARY:
725             return source.substring(offset16, offset16 + 2);
726         case TRAIL_SURROGATE_BOUNDARY:
727             return source.substring(offset16 - 1, offset16 + 1);
728         default:
729             return source.substring(offset16, offset16 + 1);
730         }
731     }
732 
733     /**
734      * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
735      * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
736      * returned, except when either the leading or trailing surrogate character lies out of the
737      * specified subarray. In the latter case, only the surrogate character within bounds will be
738      * returned. If a validity check is required, use
739      * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the codepoint at
740      * offset16 before calling. The result returned will be a newly created String containing the
741      * relevant characters.
742      *
743      * @param source The input char array.
744      * @param start Start index of the subarray
745      * @param limit End index of the subarray
746      * @param offset16 The UTF16 index to the codepoint in source relative to start
747      * @return string value of char32 in UTF16 format
748      * @stable ICU 2.1
749      */
valueOf(char source[], int start, int limit, int offset16)750     public static String valueOf(char source[], int start, int limit, int offset16) {
751         switch (bounds(source, start, limit, offset16)) {
752         case LEAD_SURROGATE_BOUNDARY:
753             return new String(source, start + offset16, 2);
754         case TRAIL_SURROGATE_BOUNDARY:
755             return new String(source, start + offset16 - 1, 2);
756         }
757         return new String(source, start + offset16, 1);
758     }
759 
760     /**
761      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
762      * the {@link UTF16 class description} for notes on roundtripping.
763      *
764      * @param source The UTF-16 string
765      * @param offset32 UTF-32 offset
766      * @return UTF-16 offset
767      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
768      * @stable ICU 2.1
769      */
findOffsetFromCodePoint(String source, int offset32)770     public static int findOffsetFromCodePoint(String source, int offset32) {
771         char ch;
772         int size = source.length(), result = 0, count = offset32;
773         if (offset32 < 0 || offset32 > size) {
774             throw new StringIndexOutOfBoundsException(offset32);
775         }
776         while (result < size && count > 0) {
777             ch = source.charAt(result);
778             if (isLeadSurrogate(ch) && ((result + 1) < size)
779                     && isTrailSurrogate(source.charAt(result + 1))) {
780                 result++;
781             }
782 
783             count--;
784             result++;
785         }
786         if (count != 0) {
787             throw new StringIndexOutOfBoundsException(offset32);
788         }
789         return result;
790     }
791 
792     /**
793      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
794      * the {@link UTF16 class description} for notes on roundtripping.
795      *
796      * @param source The UTF-16 string buffer
797      * @param offset32 UTF-32 offset
798      * @return UTF-16 offset
799      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
800      * @stable ICU 2.1
801      */
findOffsetFromCodePoint(StringBuffer source, int offset32)802     public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
803         char ch;
804         int size = source.length(), result = 0, count = offset32;
805         if (offset32 < 0 || offset32 > size) {
806             throw new StringIndexOutOfBoundsException(offset32);
807         }
808         while (result < size && count > 0) {
809             ch = source.charAt(result);
810             if (isLeadSurrogate(ch) && ((result + 1) < size)
811                     && isTrailSurrogate(source.charAt(result + 1))) {
812                 result++;
813             }
814 
815             count--;
816             result++;
817         }
818         if (count != 0) {
819             throw new StringIndexOutOfBoundsException(offset32);
820         }
821         return result;
822     }
823 
824     /**
825      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
826      * the {@link UTF16 class description} for notes on roundtripping.
827      *
828      * @param source The UTF-16 char array whose substring is to be analysed
829      * @param start Offset of the substring to be analysed
830      * @param limit Offset of the substring to be analysed
831      * @param offset32 UTF-32 offset relative to start
832      * @return UTF-16 offset relative to start
833      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
834      * @stable ICU 2.1
835      */
findOffsetFromCodePoint(char source[], int start, int limit, int offset32)836     public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
837         char ch;
838         int result = start, count = offset32;
839         if (offset32 > limit - start) {
840             throw new ArrayIndexOutOfBoundsException(offset32);
841         }
842         while (result < limit && count > 0) {
843             ch = source[result];
844             if (isLeadSurrogate(ch) && ((result + 1) < limit)
845                     && isTrailSurrogate(source[result + 1])) {
846                 result++;
847             }
848 
849             count--;
850             result++;
851         }
852         if (count != 0) {
853             throw new ArrayIndexOutOfBoundsException(offset32);
854         }
855         return result - start;
856     }
857 
858     /**
859      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
860      * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for
861      * notes on roundtripping.<br>
862      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
863      * of the <strong>lead</strong> of the pair is returned. </i>
864      * <p>
865      * To find the UTF-32 length of a string, use:
866      *
867      * <pre>
868      * len32 = countCodePoint(source, source.length());
869      * </pre>
870      *
871      * @param source Text to analyse
872      * @param offset16 UTF-16 offset &lt; source text length.
873      * @return UTF-32 offset
874      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
875      * @stable ICU 2.1
876      */
findCodePointOffset(String source, int offset16)877     public static int findCodePointOffset(String source, int offset16) {
878         if (offset16 < 0 || offset16 > source.length()) {
879             throw new StringIndexOutOfBoundsException(offset16);
880         }
881 
882         int result = 0;
883         char ch;
884         boolean hadLeadSurrogate = false;
885 
886         for (int i = 0; i < offset16; ++i) {
887             ch = source.charAt(i);
888             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
889                 hadLeadSurrogate = false; // count valid trail as zero
890             } else {
891                 hadLeadSurrogate = isLeadSurrogate(ch);
892                 ++result; // count others as 1
893             }
894         }
895 
896         if (offset16 == source.length()) {
897             return result;
898         }
899 
900         // end of source being the less significant surrogate character
901         // shift result back to the start of the supplementary character
902         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
903             result--;
904         }
905 
906         return result;
907     }
908 
909     /**
910      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
911      * offset. Used for random access. See the {@link UTF16 class description} for notes on
912      * roundtripping.<br>
913      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
914      * of the <strong>lead</strong> of the pair is returned. </i>
915      * <p>
916      * To find the UTF-32 length of a string, use:
917      *
918      * <pre>
919      * len32 = countCodePoint(source);
920      * </pre>
921      *
922      * @param source Text to analyse
923      * @param offset16 UTF-16 offset &lt; source text length.
924      * @return UTF-32 offset
925      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
926      * @stable ICU 2.1
927      */
findCodePointOffset(StringBuffer source, int offset16)928     public static int findCodePointOffset(StringBuffer source, int offset16) {
929         if (offset16 < 0 || offset16 > source.length()) {
930             throw new StringIndexOutOfBoundsException(offset16);
931         }
932 
933         int result = 0;
934         char ch;
935         boolean hadLeadSurrogate = false;
936 
937         for (int i = 0; i < offset16; ++i) {
938             ch = source.charAt(i);
939             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
940                 hadLeadSurrogate = false; // count valid trail as zero
941             } else {
942                 hadLeadSurrogate = isLeadSurrogate(ch);
943                 ++result; // count others as 1
944             }
945         }
946 
947         if (offset16 == source.length()) {
948             return result;
949         }
950 
951         // end of source being the less significant surrogate character
952         // shift result back to the start of the supplementary character
953         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
954             result--;
955         }
956 
957         return result;
958     }
959 
960     /**
961      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
962      * offset. Used for random access. See the {@link UTF16 class description} for notes on
963      * roundtripping.<br>
964      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
965      * of the <strong>lead</strong> of the pair is returned. </i>
966      * <p>
967      * To find the UTF-32 length of a substring, use:
968      *
969      * <pre>
970      * len32 = countCodePoint(source, start, limit);
971      * </pre>
972      *
973      * @param source Text to analyse
974      * @param start Offset of the substring
975      * @param limit Offset of the substring
976      * @param offset16 UTF-16 relative to start
977      * @return UTF-32 offset relative to start
978      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
979      * @stable ICU 2.1
980      */
findCodePointOffset(char source[], int start, int limit, int offset16)981     public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
982         offset16 += start;
983         if (offset16 > limit) {
984             throw new StringIndexOutOfBoundsException(offset16);
985         }
986 
987         int result = 0;
988         char ch;
989         boolean hadLeadSurrogate = false;
990 
991         for (int i = start; i < offset16; ++i) {
992             ch = source[i];
993             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
994                 hadLeadSurrogate = false; // count valid trail as zero
995             } else {
996                 hadLeadSurrogate = isLeadSurrogate(ch);
997                 ++result; // count others as 1
998             }
999         }
1000 
1001         if (offset16 == limit) {
1002             return result;
1003         }
1004 
1005         // end of source being the less significant surrogate character
1006         // shift result back to the start of the supplementary character
1007         if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
1008             result--;
1009         }
1010 
1011         return result;
1012     }
1013 
1014     /**
1015      * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
1016      * use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before
1017      * calling.
1018      *
1019      * @param target The buffer to append to
1020      * @param char32 Value to append.
1021      * @return the updated StringBuffer
1022      * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints
1023      * @stable ICU 2.1
1024      */
append(StringBuffer target, int char32)1025     public static StringBuffer append(StringBuffer target, int char32) {
1026         // Check for irregular values
1027         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1028             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
1029         }
1030 
1031         // Write the UTF-16 values
1032         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1033             target.append(getLeadSurrogate(char32));
1034             target.append(getTrailSurrogate(char32));
1035         } else {
1036             target.append((char) char32);
1037         }
1038         return target;
1039     }
1040 
1041     /**
1042      * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
1043      * convenience.
1044      *
1045      * @param target The buffer to append to
1046      * @param cp The code point to append
1047      * @return the updated StringBuffer
1048      * @throws IllegalArgumentException If cp is not a valid code point
1049      * @stable ICU 3.0
1050      */
appendCodePoint(StringBuffer target, int cp)1051     public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
1052         return append(target, cp);
1053     }
1054 
1055     /**
1056      * Adds a codepoint to offset16 position of the argument char array.
1057      *
1058      * @param target Char array to be append with the new code point
1059      * @param limit UTF16 offset which the codepoint will be appended.
1060      * @param char32 Code point to be appended
1061      * @return offset after char32 in the array.
1062      * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not
1063      *                lie within the range of the Unicode codepoints.
1064      * @stable ICU 2.1
1065      */
append(char[] target, int limit, int char32)1066     public static int append(char[] target, int limit, int char32) {
1067         // Check for irregular values
1068         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1069             throw new IllegalArgumentException("Illegal codepoint");
1070         }
1071         // Write the UTF-16 values
1072         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1073             target[limit++] = getLeadSurrogate(char32);
1074             target[limit++] = getTrailSurrogate(char32);
1075         } else {
1076             target[limit++] = (char) char32;
1077         }
1078         return limit;
1079     }
1080 
1081     /**
1082      * Number of codepoints in a UTF16 String
1083      *
1084      * @param source UTF16 string
1085      * @return number of codepoint in string
1086      * @stable ICU 2.1
1087      */
countCodePoint(String source)1088     public static int countCodePoint(String source) {
1089         if (source == null || source.length() == 0) {
1090             return 0;
1091         }
1092         return findCodePointOffset(source, source.length());
1093     }
1094 
1095     /**
1096      * Number of codepoints in a UTF16 String buffer
1097      *
1098      * @param source UTF16 string buffer
1099      * @return number of codepoint in string
1100      * @stable ICU 2.1
1101      */
countCodePoint(StringBuffer source)1102     public static int countCodePoint(StringBuffer source) {
1103         if (source == null || source.length() == 0) {
1104             return 0;
1105         }
1106         return findCodePointOffset(source, source.length());
1107     }
1108 
1109     /**
1110      * Number of codepoints in a UTF16 char array substring
1111      *
1112      * @param source UTF16 char array
1113      * @param start Offset of the substring
1114      * @param limit Offset of the substring
1115      * @return number of codepoint in the substring
1116      * @exception IndexOutOfBoundsException If start and limit are not valid.
1117      * @stable ICU 2.1
1118      */
countCodePoint(char source[], int start, int limit)1119     public static int countCodePoint(char source[], int start, int limit) {
1120         if (source == null || source.length == 0) {
1121             return 0;
1122         }
1123         return findCodePointOffset(source, start, limit, limit - start);
1124     }
1125 
1126     /**
1127      * Set a code point into a UTF16 position. Adjusts target according if we are replacing a
1128      * non-supplementary codepoint with a supplementary and vice versa.
1129      *
1130      * @param target Stringbuffer
1131      * @param offset16 UTF16 position to insert into
1132      * @param char32 Code point
1133      * @stable ICU 2.1
1134      */
setCharAt(StringBuffer target, int offset16, int char32)1135     public static void setCharAt(StringBuffer target, int offset16, int char32) {
1136         int count = 1;
1137         char single = target.charAt(offset16);
1138 
1139         if (isSurrogate(single)) {
1140             // pairs of the surrogate with offset16 at the lead char found
1141             if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
1142                     && isTrailSurrogate(target.charAt(offset16 + 1))) {
1143                 count++;
1144             } else {
1145                 // pairs of the surrogate with offset16 at the trail char
1146                 // found
1147                 if (isTrailSurrogate(single) && (offset16 > 0)
1148                         && isLeadSurrogate(target.charAt(offset16 - 1))) {
1149                     offset16--;
1150                     count++;
1151                 }
1152             }
1153         }
1154         target.replace(offset16, offset16 + count, valueOf(char32));
1155     }
1156 
1157     /**
1158      * Set a code point into a UTF16 position in a char array. Adjusts target according if we are
1159      * replacing a non-supplementary codepoint with a supplementary and vice versa.
1160      *
1161      * @param target char array
1162      * @param limit numbers of valid chars in target, different from target.length. limit counts the
1163      *            number of chars in target that represents a string, not the size of array target.
1164      * @param offset16 UTF16 position to insert into
1165      * @param char32 code point
1166      * @return new number of chars in target that represents a string
1167      * @exception IndexOutOfBoundsException if offset16 is out of range
1168      * @stable ICU 2.1
1169      */
setCharAt(char target[], int limit, int offset16, int char32)1170     public static int setCharAt(char target[], int limit, int offset16, int char32) {
1171         if (offset16 >= limit) {
1172             throw new ArrayIndexOutOfBoundsException(offset16);
1173         }
1174         int count = 1;
1175         char single = target[offset16];
1176 
1177         if (isSurrogate(single)) {
1178             // pairs of the surrogate with offset16 at the lead char found
1179             if (isLeadSurrogate(single) && (target.length > offset16 + 1)
1180                     && isTrailSurrogate(target[offset16 + 1])) {
1181                 count++;
1182             } else {
1183                 // pairs of the surrogate with offset16 at the trail char
1184                 // found
1185                 if (isTrailSurrogate(single) && (offset16 > 0)
1186                         && isLeadSurrogate(target[offset16 - 1])) {
1187                     offset16--;
1188                     count++;
1189                 }
1190             }
1191         }
1192 
1193         String str = valueOf(char32);
1194         int result = limit;
1195         int strlength = str.length();
1196         target[offset16] = str.charAt(0);
1197         if (count == strlength) {
1198             if (count == 2) {
1199                 target[offset16 + 1] = str.charAt(1);
1200             }
1201         } else {
1202             // this is not exact match in space, we'll have to do some
1203             // shifting
1204             System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
1205                     - (offset16 + count));
1206             if (count < strlength) {
1207                 // char32 is a supplementary character trying to squeeze into
1208                 // a non-supplementary space
1209                 target[offset16 + 1] = str.charAt(1);
1210                 result++;
1211                 if (result < target.length) {
1212                     target[result] = 0;
1213                 }
1214             } else {
1215                 // char32 is a non-supplementary character trying to fill
1216                 // into a supplementary space
1217                 result--;
1218                 target[result] = 0;
1219             }
1220         }
1221         return result;
1222     }
1223 
1224     /**
1225      * Shifts offset16 by the argument number of codepoints
1226      *
1227      * @param source string
1228      * @param offset16 UTF16 position to shift
1229      * @param shift32 number of codepoints to shift
1230      * @return new shifted offset16
1231      * @exception IndexOutOfBoundsException if the new offset16 is out of bounds.
1232      * @stable ICU 2.1
1233      */
moveCodePointOffset(String source, int offset16, int shift32)1234     public static int moveCodePointOffset(String source, int offset16, int shift32) {
1235         int result = offset16;
1236         int size = source.length();
1237         int count;
1238         char ch;
1239         if (offset16 < 0 || offset16 > size) {
1240             throw new StringIndexOutOfBoundsException(offset16);
1241         }
1242         if (shift32 > 0) {
1243             if (shift32 + offset16 > size) {
1244                 throw new StringIndexOutOfBoundsException(offset16);
1245             }
1246             count = shift32;
1247             while (result < size && count > 0) {
1248                 ch = source.charAt(result);
1249                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1250                         && isTrailSurrogate(source.charAt(result + 1))) {
1251                     result++;
1252                 }
1253                 count--;
1254                 result++;
1255             }
1256         } else {
1257             if (offset16 + shift32 < 0) {
1258                 throw new StringIndexOutOfBoundsException(offset16);
1259             }
1260             for (count = -shift32; count > 0; count--) {
1261                 result--;
1262                 if (result < 0) {
1263                     break;
1264                 }
1265                 ch = source.charAt(result);
1266                 if (isTrailSurrogate(ch) && result > 0
1267                         && isLeadSurrogate(source.charAt(result - 1))) {
1268                     result--;
1269                 }
1270             }
1271         }
1272         if (count != 0) {
1273             throw new StringIndexOutOfBoundsException(shift32);
1274         }
1275         return result;
1276     }
1277 
1278     /**
1279      * Shifts offset16 by the argument number of codepoints
1280      *
1281      * @param source String buffer
1282      * @param offset16 UTF16 position to shift
1283      * @param shift32 Number of codepoints to shift
1284      * @return new shifted offset16
1285      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds.
1286      * @stable ICU 2.1
1287      */
moveCodePointOffset(StringBuffer source, int offset16, int shift32)1288     public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
1289         int result = offset16;
1290         int size = source.length();
1291         int count;
1292         char ch;
1293         if (offset16 < 0 || offset16 > size) {
1294             throw new StringIndexOutOfBoundsException(offset16);
1295         }
1296         if (shift32 > 0) {
1297             if (shift32 + offset16 > size) {
1298                 throw new StringIndexOutOfBoundsException(offset16);
1299             }
1300             count = shift32;
1301             while (result < size && count > 0) {
1302                 ch = source.charAt(result);
1303                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1304                         && isTrailSurrogate(source.charAt(result + 1))) {
1305                     result++;
1306                 }
1307                 count--;
1308                 result++;
1309             }
1310         } else {
1311             if (offset16 + shift32 < 0) {
1312                 throw new StringIndexOutOfBoundsException(offset16);
1313             }
1314             for (count = -shift32; count > 0; count--) {
1315                 result--;
1316                 if (result < 0) {
1317                     break;
1318                 }
1319                 ch = source.charAt(result);
1320                 if (isTrailSurrogate(ch) && result > 0
1321                         && isLeadSurrogate(source.charAt(result - 1))) {
1322                     result--;
1323                 }
1324             }
1325         }
1326         if (count != 0) {
1327             throw new StringIndexOutOfBoundsException(shift32);
1328         }
1329         return result;
1330     }
1331 
1332     /**
1333      * Shifts offset16 by the argument number of codepoints within a subarray.
1334      *
1335      * @param source Char array
1336      * @param start Position of the subarray to be performed on
1337      * @param limit Position of the subarray to be performed on
1338      * @param offset16 UTF16 position to shift relative to start
1339      * @param shift32 Number of codepoints to shift
1340      * @return new shifted offset16 relative to start
1341      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the
1342      *                subarray bounds are out of range.
1343      * @stable ICU 2.1
1344      */
moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32)1345     public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
1346             int shift32) {
1347         int size = source.length;
1348         int count;
1349         char ch;
1350         int result = offset16 + start;
1351         if (start < 0 || limit < start) {
1352             throw new StringIndexOutOfBoundsException(start);
1353         }
1354         if (limit > size) {
1355             throw new StringIndexOutOfBoundsException(limit);
1356         }
1357         if (offset16 < 0 || result > limit) {
1358             throw new StringIndexOutOfBoundsException(offset16);
1359         }
1360         if (shift32 > 0) {
1361             if (shift32 + result > size) {
1362                 throw new StringIndexOutOfBoundsException(result);
1363             }
1364             count = shift32;
1365             while (result < limit && count > 0) {
1366                 ch = source[result];
1367                 if (isLeadSurrogate(ch) && (result + 1 < limit)
1368                         && isTrailSurrogate(source[result + 1])) {
1369                     result++;
1370                 }
1371                 count--;
1372                 result++;
1373             }
1374         } else {
1375             if (result + shift32 < start) {
1376                 throw new StringIndexOutOfBoundsException(result);
1377             }
1378             for (count = -shift32; count > 0; count--) {
1379                 result--;
1380                 if (result < start) {
1381                     break;
1382                 }
1383                 ch = source[result];
1384                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
1385                     result--;
1386                 }
1387             }
1388         }
1389         if (count != 0) {
1390             throw new StringIndexOutOfBoundsException(shift32);
1391         }
1392         result -= start;
1393         return result;
1394     }
1395 
1396     /**
1397      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1398      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1399      * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
1400      * otherwise.
1401      * <p>
1402      * The overall effect is exactly as if the argument were converted to a string by the method
1403      * valueOf(char) and the characters in that string were then inserted into target at the
1404      * position indicated by offset16.
1405      * </p>
1406      * <p>
1407      * The offset argument must be greater than or equal to 0, and less than or equal to the length
1408      * of source.
1409      *
1410      * @param target String buffer to insert to
1411      * @param offset16 Offset which char32 will be inserted in
1412      * @param char32 Codepoint to be inserted
1413      * @return a reference to target
1414      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1415      * @stable ICU 2.1
1416      */
insert(StringBuffer target, int offset16, int char32)1417     public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
1418         String str = valueOf(char32);
1419         if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1420             offset16++;
1421         }
1422         target.insert(offset16, str);
1423         return target;
1424     }
1425 
1426     /**
1427      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1428      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1429      * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1430      * <p>
1431      * The overall effect is exactly as if the argument were converted to a string by the method
1432      * valueOf(char) and the characters in that string were then inserted into target at the
1433      * position indicated by offset16.
1434      * </p>
1435      * <p>
1436      * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
1437      *
1438      * @param target Char array to insert to
1439      * @param limit End index of the char array, limit &lt;= target.length
1440      * @param offset16 Offset which char32 will be inserted in
1441      * @param char32 Codepoint to be inserted
1442      * @return new limit size
1443      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1444      * @stable ICU 2.1
1445      */
insert(char target[], int limit, int offset16, int char32)1446     public static int insert(char target[], int limit, int offset16, int char32) {
1447         String str = valueOf(char32);
1448         if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1449             offset16++;
1450         }
1451         int size = str.length();
1452         if (limit + size > target.length) {
1453             throw new ArrayIndexOutOfBoundsException(offset16 + size);
1454         }
1455         System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
1456         target[offset16] = str.charAt(0);
1457         if (size == 2) {
1458             target[offset16 + 1] = str.charAt(1);
1459         }
1460         return limit + size;
1461     }
1462 
1463     /**
1464      * Removes the codepoint at the specified position in this target (shortening target by 1
1465      * character if the codepoint is a non-supplementary, 2 otherwise).
1466      *
1467      * @param target String buffer to remove codepoint from
1468      * @param offset16 Offset which the codepoint will be removed
1469      * @return a reference to target
1470      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1471      * @stable ICU 2.1
1472      */
delete(StringBuffer target, int offset16)1473     public static StringBuffer delete(StringBuffer target, int offset16) {
1474         int count = 1;
1475         switch (bounds(target, offset16)) {
1476         case LEAD_SURROGATE_BOUNDARY:
1477             count++;
1478             break;
1479         case TRAIL_SURROGATE_BOUNDARY:
1480             count++;
1481             offset16--;
1482             break;
1483         }
1484         target.delete(offset16, offset16 + count);
1485         return target;
1486     }
1487 
1488     /**
1489      * Removes the codepoint at the specified position in this target (shortening target by 1
1490      * character if the codepoint is a non-supplementary, 2 otherwise).
1491      *
1492      * @param target String buffer to remove codepoint from
1493      * @param limit End index of the char array, limit &lt;= target.length
1494      * @param offset16 Offset which the codepoint will be removed
1495      * @return a new limit size
1496      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1497      * @stable ICU 2.1
1498      */
delete(char target[], int limit, int offset16)1499     public static int delete(char target[], int limit, int offset16) {
1500         int count = 1;
1501         switch (bounds(target, 0, limit, offset16)) {
1502         case LEAD_SURROGATE_BOUNDARY:
1503             count++;
1504             break;
1505         case TRAIL_SURROGATE_BOUNDARY:
1506             count++;
1507             offset16--;
1508             break;
1509         }
1510         System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
1511         target[limit - count] = 0;
1512         return limit - count;
1513     }
1514 
1515     /**
1516      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1517      * the argument codepoint. I.e., the smallest index <code>i</code> such that
1518      * <code>UTF16.charAt(source, i) ==
1519      * char32</code> is true.
1520      * <p>
1521      * If no such character occurs in this string, then -1 is returned.
1522      * </p>
1523      * <p>
1524      * Examples:<br>
1525      * UTF16.indexOf("abc", 'a') returns 0<br>
1526      * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1527      * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1528      * </p>
1529      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1530      * characters to its fullest.
1531      *
1532      * @param source UTF16 format Unicode string that will be searched
1533      * @param char32 Codepoint to search for
1534      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1535      *         -1 if the codepoint does not occur.
1536      * @stable ICU 2.6
1537      */
indexOf(String source, int char32)1538     public static int indexOf(String source, int char32) {
1539         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1540             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1541         }
1542         // non-surrogate bmp
1543         if (char32 < LEAD_SURROGATE_MIN_VALUE
1544                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1545             return source.indexOf((char) char32);
1546         }
1547         // surrogate
1548         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1549             int result = source.indexOf((char) char32);
1550             if (result >= 0) {
1551                 if (isLeadSurrogate(char32) && (result < source.length() - 1)
1552                         && isTrailSurrogate(source.charAt(result + 1))) {
1553                     return indexOf(source, char32, result + 1);
1554                 }
1555                 // trail surrogate
1556                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1557                     return indexOf(source, char32, result + 1);
1558                 }
1559             }
1560             return result;
1561         }
1562         // supplementary
1563         String char32str = toString(char32);
1564         return source.indexOf(char32str);
1565     }
1566 
1567     /**
1568      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1569      * the argument string str. This method is implemented based on codepoints, hence a "lead
1570      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1571      * starts with trail surrogate character at index 0, a source with a leading a surrogate
1572      * character before str found at in source will not have a valid match. Vice versa for lead
1573      * surrogates that ends str. See example below.
1574      * <p>
1575      * If no such string str occurs in this source, then -1 is returned.
1576      * </p>
1577      * <p>
1578      * Examples:<br>
1579      * UTF16.indexOf("abc", "ab") returns 0<br>
1580      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1581      * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1582      * </p>
1583      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1584      * characters to its fullest.
1585      *
1586      * @param source UTF16 format Unicode string that will be searched
1587      * @param str UTF16 format Unicode string to search for
1588      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1589      *         -1 if the codepoint does not occur.
1590      * @stable ICU 2.6
1591      */
indexOf(String source, String str)1592     public static int indexOf(String source, String str) {
1593         int strLength = str.length();
1594         // non-surrogate ends
1595         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1596             return source.indexOf(str);
1597         }
1598 
1599         int result = source.indexOf(str);
1600         int resultEnd = result + strLength;
1601         if (result >= 0) {
1602             // check last character
1603             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1604                     && isTrailSurrogate(source.charAt(resultEnd + 1))) {
1605                 return indexOf(source, str, resultEnd + 1);
1606             }
1607             // check first character which is a trail surrogate
1608             if (isTrailSurrogate(str.charAt(0)) && result > 0
1609                     && isLeadSurrogate(source.charAt(result - 1))) {
1610                 return indexOf(source, str, resultEnd + 1);
1611             }
1612         }
1613         return result;
1614     }
1615 
1616     /**
1617      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1618      * the argument codepoint. I.e., the smallest index i such that: <br>
1619      * (UTF16.charAt(source, i) == char32 &amp;&amp; i &gt;= fromIndex) is true.
1620      * <p>
1621      * If no such character occurs in this string, then -1 is returned.
1622      * </p>
1623      * <p>
1624      * Examples:<br>
1625      * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1626      * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
1627      * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
1628      * </p>
1629      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1630      * characters to its fullest.
1631      *
1632      * @param source UTF16 format Unicode string that will be searched
1633      * @param char32 Codepoint to search for
1634      * @param fromIndex The index to start the search from.
1635      * @return the index of the first occurrence of the codepoint in the argument Unicode string at
1636      *         or after fromIndex, or -1 if the codepoint does not occur.
1637      * @stable ICU 2.6
1638      */
indexOf(String source, int char32, int fromIndex)1639     public static int indexOf(String source, int char32, int fromIndex) {
1640         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1641             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1642         }
1643         // non-surrogate bmp
1644         if (char32 < LEAD_SURROGATE_MIN_VALUE
1645                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1646             return source.indexOf((char) char32, fromIndex);
1647         }
1648         // surrogate
1649         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1650             int result = source.indexOf((char) char32, fromIndex);
1651             if (result >= 0) {
1652                 if (isLeadSurrogate(char32) && (result < source.length() - 1)
1653                         && isTrailSurrogate(source.charAt(result + 1))) {
1654                     return indexOf(source, char32, result + 1);
1655                 }
1656                 // trail surrogate
1657                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1658                     return indexOf(source, char32, result + 1);
1659                 }
1660             }
1661             return result;
1662         }
1663         // supplementary
1664         String char32str = toString(char32);
1665         return source.indexOf(char32str, fromIndex);
1666     }
1667 
1668     /**
1669      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1670      * the argument string str. This method is implemented based on codepoints, hence a "lead
1671      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1672      * starts with trail surrogate character at index 0, a source with a leading a surrogate
1673      * character before str found at in source will not have a valid match. Vice versa for lead
1674      * surrogates that ends str. See example below.
1675      * <p>
1676      * If no such string str occurs in this source, then -1 is returned.
1677      * </p>
1678      * <p>
1679      * Examples:<br>
1680      * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1681      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
1682      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
1683      * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
1684      * </p>
1685      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1686      * characters to its fullest.
1687      *
1688      * @param source UTF16 format Unicode string that will be searched
1689      * @param str UTF16 format Unicode string to search for
1690      * @param fromIndex The index to start the search from.
1691      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1692      *         -1 if the codepoint does not occur.
1693      * @stable ICU 2.6
1694      */
indexOf(String source, String str, int fromIndex)1695     public static int indexOf(String source, String str, int fromIndex) {
1696         int strLength = str.length();
1697         // non-surrogate ends
1698         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1699             return source.indexOf(str, fromIndex);
1700         }
1701 
1702         int result = source.indexOf(str, fromIndex);
1703         int resultEnd = result + strLength;
1704         if (result >= 0) {
1705             // check last character
1706             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1707                     && isTrailSurrogate(source.charAt(resultEnd))) {
1708                 return indexOf(source, str, resultEnd + 1);
1709             }
1710             // check first character which is a trail surrogate
1711             if (isTrailSurrogate(str.charAt(0)) && result > 0
1712                     && isLeadSurrogate(source.charAt(result - 1))) {
1713                 return indexOf(source, str, resultEnd + 1);
1714             }
1715         }
1716         return result;
1717     }
1718 
1719     /**
1720      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1721      * the argument codepoint. I.e., the index returned is the largest value i such that:
1722      * UTF16.charAt(source, i) == char32 is true.
1723      * <p>
1724      * Examples:<br>
1725      * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1726      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1727      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1728      * </p>
1729      * <p>
1730      * source is searched backwards starting at the last character.
1731      * </p>
1732      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1733      * characters to its fullest.
1734      *
1735      * @param source UTF16 format Unicode string that will be searched
1736      * @param char32 Codepoint to search for
1737      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1738      *         does not occur.
1739      * @stable ICU 2.6
1740      */
lastIndexOf(String source, int char32)1741     public static int lastIndexOf(String source, int char32) {
1742         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1743             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1744         }
1745         // non-surrogate bmp
1746         if (char32 < LEAD_SURROGATE_MIN_VALUE
1747                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1748             return source.lastIndexOf((char) char32);
1749         }
1750         // surrogate
1751         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1752             int result = source.lastIndexOf((char) char32);
1753             if (result >= 0) {
1754                 if (isLeadSurrogate(char32) && (result < source.length() - 1)
1755                         && isTrailSurrogate(source.charAt(result + 1))) {
1756                     return lastIndexOf(source, char32, result - 1);
1757                 }
1758                 // trail surrogate
1759                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1760                     return lastIndexOf(source, char32, result - 1);
1761                 }
1762             }
1763             return result;
1764         }
1765         // supplementary
1766         String char32str = toString(char32);
1767         return source.lastIndexOf(char32str);
1768     }
1769 
1770     /**
1771      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1772      * the argument string str. This method is implemented based on codepoints, hence a "lead
1773      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1774      * starts with trail surrogate character at index 0, a source with a leading a surrogate
1775      * character before str found at in source will not have a valid match. Vice versa for lead
1776      * surrogates that ends str. See example below.
1777      * <p>
1778      * Examples:<br>
1779      * UTF16.lastIndexOf("abc", "a") returns 0<br>
1780      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1781      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1782      * </p>
1783      * <p>
1784      * source is searched backwards starting at the last character.
1785      * </p>
1786      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1787      * characters to its fullest.
1788      *
1789      * @param source UTF16 format Unicode string that will be searched
1790      * @param str UTF16 format Unicode string to search for
1791      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1792      *         does not occur.
1793      * @stable ICU 2.6
1794      */
lastIndexOf(String source, String str)1795     public static int lastIndexOf(String source, String str) {
1796         int strLength = str.length();
1797         // non-surrogate ends
1798         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1799             return source.lastIndexOf(str);
1800         }
1801 
1802         int result = source.lastIndexOf(str);
1803         if (result >= 0) {
1804             // check last character
1805             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1806                     && isTrailSurrogate(source.charAt(result + strLength + 1))) {
1807                 return lastIndexOf(source, str, result - 1);
1808             }
1809             // check first character which is a trail surrogate
1810             if (isTrailSurrogate(str.charAt(0)) && result > 0
1811                     && isLeadSurrogate(source.charAt(result - 1))) {
1812                 return lastIndexOf(source, str, result - 1);
1813             }
1814         }
1815         return result;
1816     }
1817 
1818     /**
1819      * <p>
1820      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1821      * the argument codepoint, where the result is less than or equals to fromIndex.
1822      * </p>
1823      * <p>
1824      * This method is implemented based on codepoints, hence a single surrogate character will not
1825      * match a supplementary character.
1826      * </p>
1827      * <p>
1828      * source is searched backwards starting at the last character starting at the specified index.
1829      * </p>
1830      * <p>
1831      * Examples:<br>
1832      * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1833      * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1834      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
1835      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
1836      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1837      * </p>
1838      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1839      * characters to its fullest.
1840      *
1841      * @param source UTF16 format Unicode string that will be searched
1842      * @param char32 Codepoint to search for
1843      * @param fromIndex the index to start the search from. There is no restriction on the value of
1844      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1845      *            same effect as if it were equal to one less than the length of this string: this
1846      *            entire string may be searched. If it is negative, it has the same effect as if it
1847      *            were -1: -1 is returned.
1848      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1849      *         does not occur.
1850      * @stable ICU 2.6
1851      */
lastIndexOf(String source, int char32, int fromIndex)1852     public static int lastIndexOf(String source, int char32, int fromIndex) {
1853         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1854             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1855         }
1856         // non-surrogate bmp
1857         if (char32 < LEAD_SURROGATE_MIN_VALUE
1858                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1859             return source.lastIndexOf((char) char32, fromIndex);
1860         }
1861         // surrogate
1862         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1863             int result = source.lastIndexOf((char) char32, fromIndex);
1864             if (result >= 0) {
1865                 if (isLeadSurrogate(char32) && (result < source.length() - 1)
1866                         && isTrailSurrogate(source.charAt(result + 1))) {
1867                     return lastIndexOf(source, char32, result - 1);
1868                 }
1869                 // trail surrogate
1870                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1871                     return lastIndexOf(source, char32, result - 1);
1872                 }
1873             }
1874             return result;
1875         }
1876         // supplementary
1877         String char32str = toString(char32);
1878         return source.lastIndexOf(char32str, fromIndex);
1879     }
1880 
1881     /**
1882      * <p>
1883      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1884      * the argument string str, where the result is less than or equals to fromIndex.
1885      * </p>
1886      * <p>
1887      * This method is implemented based on codepoints, hence a "lead surrogate character + trail
1888      * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate
1889      * character at index 0, a source with a leading a surrogate character before str found at in
1890      * source will not have a valid match. Vice versa for lead surrogates that ends str.
1891      * </p>
1892      * See example below.
1893      * <p>
1894      * Examples:<br>
1895      * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
1896      * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
1897      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
1898      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
1899      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
1900      * </p>
1901      * <p>
1902      * source is searched backwards starting at the last character.
1903      * </p>
1904      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1905      * characters to its fullest.
1906      *
1907      * @param source UTF16 format Unicode string that will be searched
1908      * @param str UTF16 format Unicode string to search for
1909      * @param fromIndex the index to start the search from. There is no restriction on the value of
1910      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1911      *            same effect as if it were equal to one less than the length of this string: this
1912      *            entire string may be searched. If it is negative, it has the same effect as if it
1913      *            were -1: -1 is returned.
1914      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1915      *         does not occur.
1916      * @stable ICU 2.6
1917      */
lastIndexOf(String source, String str, int fromIndex)1918     public static int lastIndexOf(String source, String str, int fromIndex) {
1919         int strLength = str.length();
1920         // non-surrogate ends
1921         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1922             return source.lastIndexOf(str, fromIndex);
1923         }
1924 
1925         int result = source.lastIndexOf(str, fromIndex);
1926         if (result >= 0) {
1927             // check last character
1928             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1929                     && isTrailSurrogate(source.charAt(result + strLength))) {
1930                 return lastIndexOf(source, str, result - 1);
1931             }
1932             // check first character which is a trail surrogate
1933             if (isTrailSurrogate(str.charAt(0)) && result > 0
1934                     && isLeadSurrogate(source.charAt(result - 1))) {
1935                 return lastIndexOf(source, str, result - 1);
1936             }
1937         }
1938         return result;
1939     }
1940 
1941     /**
1942      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
1943      * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
1944      * format Unicode string source, then source will be returned. Otherwise, a new String object is
1945      * created that represents a codepoint sequence identical to the codepoint sequence represented
1946      * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
1947      * newChar32.
1948      * <p>
1949      * Examples: <br>
1950      * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
1951      * returns "mosquito in your collar"<br>
1952      * UTF16.replace("JonL", 'q', 'x');<br>
1953      * returns "JonL" (no change)<br>
1954      * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
1955      * returns "Supplementary character !"<br>
1956      * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
1957      * returns "Supplementary character \ud800\udc00"<br>
1958      * </p>
1959      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1960      * characters to its fullest.
1961      *
1962      * @param source UTF16 format Unicode string which the codepoint replacements will be based on.
1963      * @param oldChar32 Non-zero old codepoint to be replaced.
1964      * @param newChar32 The new codepoint to replace oldChar32
1965      * @return new String derived from source by replacing every occurrence of oldChar32 with
1966      *         newChar32, unless when no oldChar32 is found in source then source will be returned.
1967      * @stable ICU 2.6
1968      */
replace(String source, int oldChar32, int newChar32)1969     public static String replace(String source, int oldChar32, int newChar32) {
1970         if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
1971             throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
1972         }
1973         if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
1974             throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
1975         }
1976 
1977         int index = indexOf(source, oldChar32);
1978         if (index == -1) {
1979             return source;
1980         }
1981         String newChar32Str = toString(newChar32);
1982         int oldChar32Size = 1;
1983         int newChar32Size = newChar32Str.length();
1984         StringBuffer result = new StringBuffer(source);
1985         int resultIndex = index;
1986 
1987         if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
1988             oldChar32Size = 2;
1989         }
1990 
1991         while (index != -1) {
1992             int endResultIndex = resultIndex + oldChar32Size;
1993             result.replace(resultIndex, endResultIndex, newChar32Str);
1994             int lastEndIndex = index + oldChar32Size;
1995             index = indexOf(source, oldChar32, lastEndIndex);
1996             resultIndex += newChar32Size + index - lastEndIndex;
1997         }
1998         return result.toString();
1999     }
2000 
2001     /**
2002      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
2003      * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
2004      * source, then source will be returned. Otherwise, a new String object is created that
2005      * represents a codepoint sequence identical to the codepoint sequence represented by source,
2006      * except that every occurrence of oldStr is replaced by an occurrence of newStr.
2007      * <p>
2008      * Examples: <br>
2009      * UTF16.replace("mesquite in your cellar", "e", "o");<br>
2010      * returns "mosquito in your collar"<br>
2011      * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
2012      * returns "cat in your cellar"<br>
2013      * UTF16.replace("JonL", "q", "x");<br>
2014      * returns "JonL" (no change)<br>
2015      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br>
2016      * returns "Supplementary character !"<br>
2017      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br>
2018      * returns "Supplementary character \ud800\udc00"<br>
2019      * </p>
2020      * Note this method is provided as support to jdk 1.3, which does not support supplementary
2021      * characters to its fullest.
2022      *
2023      * @param source UTF16 format Unicode string which the replacements will be based on.
2024      * @param oldStr Non-zero-length string to be replaced.
2025      * @param newStr The new string to replace oldStr
2026      * @return new String derived from source by replacing every occurrence of oldStr with newStr.
2027      *         When no oldStr is found in source, then source will be returned.
2028      * @stable ICU 2.6
2029      */
replace(String source, String oldStr, String newStr)2030     public static String replace(String source, String oldStr, String newStr) {
2031         int index = indexOf(source, oldStr);
2032         if (index == -1) {
2033             return source;
2034         }
2035         int oldStrSize = oldStr.length();
2036         int newStrSize = newStr.length();
2037         StringBuffer result = new StringBuffer(source);
2038         int resultIndex = index;
2039 
2040         while (index != -1) {
2041             int endResultIndex = resultIndex + oldStrSize;
2042             result.replace(resultIndex, endResultIndex, newStr);
2043             int lastEndIndex = index + oldStrSize;
2044             index = indexOf(source, oldStr, lastEndIndex);
2045             resultIndex += newStrSize + index - lastEndIndex;
2046         }
2047         return result.toString();
2048     }
2049 
2050     /**
2051      * Reverses a UTF16 format Unicode string and replaces source's content with it. This method
2052      * will reverse surrogate characters correctly, instead of blindly reversing every character.
2053      * <p>
2054      * Examples:<br>
2055      * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
2056      * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
2057      *
2058      * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed
2059      * @return a modified source with reversed UTF16 format Unicode string.
2060      * @stable ICU 2.6
2061      */
reverse(StringBuffer source)2062     public static StringBuffer reverse(StringBuffer source) {
2063         int length = source.length();
2064         StringBuffer result = new StringBuffer(length);
2065         for (int i = length; i-- > 0;) {
2066             char ch = source.charAt(i);
2067             if (isTrailSurrogate(ch) && i > 0) {
2068                 char ch2 = source.charAt(i - 1);
2069                 if (isLeadSurrogate(ch2)) {
2070                     result.append(ch2);
2071                     result.append(ch);
2072                     --i;
2073                     continue;
2074                 }
2075             }
2076             result.append(ch);
2077         }
2078         return result;
2079     }
2080 
2081     /**
2082      * Check if the string contains more Unicode code points than a certain number. This is more
2083      * efficient than counting all code points in the entire string and comparing that number with a
2084      * threshold. This function may not need to scan the string at all if the length is within a
2085      * certain range, and never needs to count more than 'number + 1' code points. Logically
2086      * equivalent to (countCodePoint(s) &gt; number). A Unicode code point may occupy either one or two
2087      * code units.
2088      *
2089      * @param source The input string.
2090      * @param number The number of code points in the string is compared against the 'number'
2091      *            parameter.
2092      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2093      * @stable ICU 2.4
2094      */
hasMoreCodePointsThan(String source, int number)2095     public static boolean hasMoreCodePointsThan(String source, int number) {
2096         if (number < 0) {
2097             return true;
2098         }
2099         if (source == null) {
2100             return false;
2101         }
2102         int length = source.length();
2103 
2104         // length >= 0 known
2105         // source contains at least (length + 1) / 2 code points: <= 2
2106         // chars per cp
2107         if (((length + 1) >> 1) > number) {
2108             return true;
2109         }
2110 
2111         // check if source does not even contain enough chars
2112         int maxsupplementary = length - number;
2113         if (maxsupplementary <= 0) {
2114             return false;
2115         }
2116 
2117         // there are maxsupplementary = length - number more chars than
2118         // asked-for code points
2119 
2120         // count code points until they exceed and also check that there are
2121         // no more than maxsupplementary supplementary code points (char pairs)
2122         int start = 0;
2123         while (true) {
2124             if (length == 0) {
2125                 return false;
2126             }
2127             if (number == 0) {
2128                 return true;
2129             }
2130             if (isLeadSurrogate(source.charAt(start++)) && start != length
2131                     && isTrailSurrogate(source.charAt(start))) {
2132                 start++;
2133                 if (--maxsupplementary <= 0) {
2134                     // too many pairs - too few code points
2135                     return false;
2136                 }
2137             }
2138             --number;
2139         }
2140     }
2141 
2142     /**
2143      * Check if the sub-range of char array, from argument start to limit, contains more Unicode
2144      * code points than a certain number. This is more efficient than counting all code points in
2145      * the entire char array range and comparing that number with a threshold. This function may not
2146      * need to scan the char array at all if start and limit is within a certain range, and never
2147      * needs to count more than 'number + 1' code points. Logically equivalent to
2148      * (countCodePoint(source, start, limit) &gt; number). A Unicode code point may occupy either one
2149      * or two code units.
2150      *
2151      * @param source Array of UTF-16 chars
2152      * @param start Offset to substring in the source array for analyzing
2153      * @param limit Offset to substring in the source array for analyzing
2154      * @param number The number of code points in the string is compared against the 'number'
2155      *            parameter.
2156      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2157      * @exception IndexOutOfBoundsException Thrown when limit &lt; start
2158      * @stable ICU 2.4
2159      */
hasMoreCodePointsThan(char source[], int start, int limit, int number)2160     public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
2161         int length = limit - start;
2162         if (length < 0 || start < 0 || limit < 0) {
2163             throw new IndexOutOfBoundsException(
2164                     "Start and limit indexes should be non-negative and start <= limit");
2165         }
2166         if (number < 0) {
2167             return true;
2168         }
2169         if (source == null) {
2170             return false;
2171         }
2172 
2173         // length >= 0 known
2174         // source contains at least (length + 1) / 2 code points: <= 2
2175         // chars per cp
2176         if (((length + 1) >> 1) > number) {
2177             return true;
2178         }
2179 
2180         // check if source does not even contain enough chars
2181         int maxsupplementary = length - number;
2182         if (maxsupplementary <= 0) {
2183             return false;
2184         }
2185 
2186         // there are maxsupplementary = length - number more chars than
2187         // asked-for code points
2188 
2189         // count code points until they exceed and also check that there are
2190         // no more than maxsupplementary supplementary code points (char pairs)
2191         while (true) {
2192             if (length == 0) {
2193                 return false;
2194             }
2195             if (number == 0) {
2196                 return true;
2197             }
2198             if (isLeadSurrogate(source[start++]) && start != limit
2199                     && isTrailSurrogate(source[start])) {
2200                 start++;
2201                 if (--maxsupplementary <= 0) {
2202                     // too many pairs - too few code points
2203                     return false;
2204                 }
2205             }
2206             --number;
2207         }
2208     }
2209 
2210     /**
2211      * Check if the string buffer contains more Unicode code points than a certain number. This is
2212      * more efficient than counting all code points in the entire string buffer and comparing that
2213      * number with a threshold. This function may not need to scan the string buffer at all if the
2214      * length is within a certain range, and never needs to count more than 'number + 1' code
2215      * points. Logically equivalent to (countCodePoint(s) &gt; number). A Unicode code point may
2216      * occupy either one or two code units.
2217      *
2218      * @param source The input string buffer.
2219      * @param number The number of code points in the string buffer is compared against the 'number'
2220      *            parameter.
2221      * @return boolean value for whether the string buffer contains more Unicode code points than
2222      *         'number'.
2223      * @stable ICU 2.4
2224      */
hasMoreCodePointsThan(StringBuffer source, int number)2225     public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
2226         if (number < 0) {
2227             return true;
2228         }
2229         if (source == null) {
2230             return false;
2231         }
2232         int length = source.length();
2233 
2234         // length >= 0 known
2235         // source contains at least (length + 1) / 2 code points: <= 2
2236         // chars per cp
2237         if (((length + 1) >> 1) > number) {
2238             return true;
2239         }
2240 
2241         // check if source does not even contain enough chars
2242         int maxsupplementary = length - number;
2243         if (maxsupplementary <= 0) {
2244             return false;
2245         }
2246 
2247         // there are maxsupplementary = length - number more chars than
2248         // asked-for code points
2249 
2250         // count code points until they exceed and also check that there are
2251         // no more than maxsupplementary supplementary code points (char pairs)
2252         int start = 0;
2253         while (true) {
2254             if (length == 0) {
2255                 return false;
2256             }
2257             if (number == 0) {
2258                 return true;
2259             }
2260             if (isLeadSurrogate(source.charAt(start++)) && start != length
2261                     && isTrailSurrogate(source.charAt(start))) {
2262                 start++;
2263                 if (--maxsupplementary <= 0) {
2264                     // too many pairs - too few code points
2265                     return false;
2266                 }
2267             }
2268             --number;
2269         }
2270     }
2271 
2272     /**
2273      * Cover JDK 1.5 API. Create a String from an array of codePoints.
2274      *
2275      * @param codePoints The code array
2276      * @param offset The start of the text in the code point array
2277      * @param count The number of code points
2278      * @return a String representing the code points between offset and count
2279      * @throws IllegalArgumentException If an invalid code point is encountered
2280      * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
2281      * @stable ICU 3.0
2282      */
newString(int[] codePoints, int offset, int count)2283     public static String newString(int[] codePoints, int offset, int count) {
2284         if (count < 0) {
2285             throw new IllegalArgumentException();
2286         }
2287         char[] chars = new char[count];
2288         int w = 0;
2289         for (int r = offset, e = offset + count; r < e; ++r) {
2290             int cp = codePoints[r];
2291             if (cp < 0 || cp > 0x10ffff) {
2292                 throw new IllegalArgumentException();
2293             }
2294             while (true) {
2295                 try {
2296                     if (cp < 0x010000) {
2297                         chars[w] = (char) cp;
2298                         w++;
2299                     } else {
2300                         chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
2301                         chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
2302                         w += 2;
2303                     }
2304                     break;
2305                 } catch (IndexOutOfBoundsException ex) {
2306                     int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
2307                             / (r - offset + 1)));
2308                     char[] temp = new char[newlen];
2309                     System.arraycopy(chars, 0, temp, 0, w);
2310                     chars = temp;
2311                 }
2312             }
2313         }
2314         return new String(chars, 0, w);
2315     }
2316 
2317     /**
2318      * <p>
2319      * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
2320      * modes
2321      * </p>
2322      * <ul>
2323      * <li> Code point comparison or code unit comparison
2324      * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
2325      * with special handling for character 'i'.
2326      * </ul>
2327      * <p>
2328      * The code unit or code point comparison differ only when comparing supplementary code points
2329      * (&#92;u10000..&#92;u10ffff) to BMP code points near the end of the BMP (i.e.,
2330      * &#92;ue000..&#92;uffff). In code unit comparison, high BMP code points sort after
2331      * supplementary code points because they are stored as pairs of surrogates which are at
2332      * &#92;ud800..&#92;udfff.
2333      * </p>
2334      *
2335      * @see #FOLD_CASE_DEFAULT
2336      * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2337      * @stable ICU 2.1
2338      */
2339     public static final class StringComparator implements java.util.Comparator<String> {
2340         // public constructor ------------------------------------------------
2341 
2342         /**
2343          * Default constructor that does code unit comparison and case sensitive comparison.
2344          *
2345          * @stable ICU 2.1
2346          */
StringComparator()2347         public StringComparator() {
2348             this(false, false, FOLD_CASE_DEFAULT);
2349         }
2350 
2351         /**
2352          * Constructor that does comparison based on the argument options.
2353          *
2354          * @param codepointcompare Flag to indicate true for code point comparison or false for code unit
2355          *            comparison.
2356          * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison
2357          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2358          *            when ignorecase is set to true. If ignorecase is false, this option is
2359          *            ignored.
2360          * @see #FOLD_CASE_DEFAULT
2361          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2362          * @throws IllegalArgumentException If foldcaseoption is out of range
2363          * @stable ICU 2.4
2364          */
StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption)2365         public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
2366             setCodePointCompare(codepointcompare);
2367             m_ignoreCase_ = ignorecase;
2368             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2369                 throw new IllegalArgumentException("Invalid fold case option");
2370             }
2371             m_foldCase_ = foldcaseoption;
2372         }
2373 
2374         // public data member ------------------------------------------------
2375 
2376         /**
2377          * Option value for case folding comparison:
2378          *
2379          * <p>Comparison is case insensitive, strings are folded using default mappings defined in
2380          * Unicode data file CaseFolding.txt, before comparison.
2381          *
2382          * @stable ICU 2.4
2383          */
2384         public static final int FOLD_CASE_DEFAULT = 0;
2385 
2386         /**
2387          * Option value for case folding:
2388          * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
2389          * and dotless i appropriately for Turkic languages (tr, az).
2390          *
2391          * <p>Comparison is case insensitive, strings are folded using modified mappings defined in
2392          * Unicode data file CaseFolding.txt, before comparison.
2393          *
2394          * @stable ICU 2.4
2395          * @see com.ibm.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
2396          */
2397         public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2398 
2399         // public methods ----------------------------------------------------
2400 
2401         // public setters ----------------------------------------------------
2402 
2403         /**
2404          * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
2405          * is set to code unit compare
2406          *
2407          * @param flag True for code point compare, false for code unit compare
2408          * @stable ICU 2.4
2409          */
setCodePointCompare(boolean flag)2410         public void setCodePointCompare(boolean flag) {
2411             if (flag) {
2412                 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2413             } else {
2414                 m_codePointCompare_ = 0;
2415             }
2416         }
2417 
2418         /**
2419          * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
2420          * case sensitive comparison mode if set to false.
2421          *
2422          * @param ignorecase True for case-insensitive comparison, false for case sensitive comparison
2423          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2424          *            when ignorecase is set to true. If ignorecase is false, this option is
2425          *            ignored.
2426          * @see #FOLD_CASE_DEFAULT
2427          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2428          * @stable ICU 2.4
2429          */
setIgnoreCase(boolean ignorecase, int foldcaseoption)2430         public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
2431             m_ignoreCase_ = ignorecase;
2432             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2433                 throw new IllegalArgumentException("Invalid fold case option");
2434             }
2435             m_foldCase_ = foldcaseoption;
2436         }
2437 
2438         // public getters ----------------------------------------------------
2439 
2440         /**
2441          * Checks if the comparison mode is code point compare.
2442          *
2443          * @return true for code point compare, false for code unit compare
2444          * @stable ICU 2.4
2445          */
getCodePointCompare()2446         public boolean getCodePointCompare() {
2447             return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2448         }
2449 
2450         /**
2451          * Checks if Comparator is in the case insensitive mode.
2452          *
2453          * @return true if Comparator performs case insensitive comparison, false otherwise
2454          * @stable ICU 2.4
2455          */
getIgnoreCase()2456         public boolean getIgnoreCase() {
2457             return m_ignoreCase_;
2458         }
2459 
2460         /**
2461          * Gets the fold case options set in Comparator to be used with case insensitive comparison.
2462          *
2463          * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2464          * @see #FOLD_CASE_DEFAULT
2465          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2466          * @stable ICU 2.4
2467          */
getIgnoreCaseOption()2468         public int getIgnoreCaseOption() {
2469             return m_foldCase_;
2470         }
2471 
2472         // public other methods ----------------------------------------------
2473 
2474         /**
2475          * Compare two strings depending on the options selected during construction.
2476          *
2477          * @param a first source string.
2478          * @param b second source string.
2479          * @return 0 returned if a == b. If a &lt; b, a negative value is returned. Otherwise if a &gt; b,
2480          *         a positive value is returned.
2481          * @exception ClassCastException thrown when either a or b is not a String object
2482          * @stable ICU 4.4
2483          */
2484         @Override
compare(String a, String b)2485         public int compare(String a, String b) {
2486             if (Utility.sameObjects(a, b)) {
2487                 return 0;
2488             }
2489             if (a == null) {
2490                 return -1;
2491             }
2492             if (b == null) {
2493                 return 1;
2494             }
2495 
2496             if (m_ignoreCase_) {
2497                 return compareCaseInsensitive(a, b);
2498             }
2499             return compareCaseSensitive(a, b);
2500         }
2501 
2502         // private data member ----------------------------------------------
2503 
2504         /**
2505          * Code unit comparison flag. True if code unit comparison is required. False if code point
2506          * comparison is required.
2507          */
2508         private int m_codePointCompare_;
2509 
2510         /**
2511          * Fold case comparison option.
2512          */
2513         private int m_foldCase_;
2514 
2515         /**
2516          * Flag indicator if ignore case is to be used during comparison
2517          */
2518         private boolean m_ignoreCase_;
2519 
2520         /**
2521          * Code point order offset for surrogate characters
2522          */
2523         private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2524 
2525         // private method ---------------------------------------------------
2526 
2527         /**
2528          * Compares case insensitive. This is a direct port of ICU4C, to make maintenance life
2529          * easier.
2530          *
2531          * @param s1
2532          *            first string to compare
2533          * @param s2
2534          *            second string to compare
2535          * @return -1 is s1 &lt; s2, 0 if equals,
2536          */
compareCaseInsensitive(String s1, String s2)2537         private int compareCaseInsensitive(String s1, String s2) {
2538             return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
2539                     | Normalizer.COMPARE_IGNORE_CASE);
2540         }
2541 
2542         /**
2543          * Compares case sensitive. This is a direct port of ICU4C, to make maintenance life
2544          * easier.
2545          *
2546          * @param s1
2547          *            first string to compare
2548          * @param s2
2549          *            second string to compare
2550          * @return -1 is s1 &lt; s2, 0 if equals,
2551          */
compareCaseSensitive(String s1, String s2)2552         private int compareCaseSensitive(String s1, String s2) {
2553             // compare identical prefixes - they do not need to be fixed up
2554             // limit1 = start1 + min(length1, length2)
2555             int length1 = s1.length();
2556             int length2 = s2.length();
2557             int minlength = length1;
2558             int result = 0;
2559             if (length1 < length2) {
2560                 result = -1;
2561             } else if (length1 > length2) {
2562                 result = 1;
2563                 minlength = length2;
2564             }
2565 
2566             char c1 = 0;
2567             char c2 = 0;
2568             int index = 0;
2569             for (; index < minlength; index++) {
2570                 c1 = s1.charAt(index);
2571                 c2 = s2.charAt(index);
2572                 // check pseudo-limit
2573                 if (c1 != c2) {
2574                     break;
2575                 }
2576             }
2577 
2578             if (index == minlength) {
2579                 return result;
2580             }
2581 
2582             boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2583             // if both values are in or above the surrogate range, fix them up
2584             if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
2585                     && codepointcompare) {
2586                 // subtract 0x2800 from BMP code points to make them smaller
2587                 // than supplementary ones
2588                 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
2589                         || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
2590                     // part of a surrogate pair, leave >=d800
2591                 } else {
2592                     // BMP code point - may be surrogate code point - make
2593                     // < d800
2594                     c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2595                 }
2596 
2597                 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
2598                         || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
2599                     // part of a surrogate pair, leave >=d800
2600                 } else {
2601                     // BMP code point - may be surrogate code point - make <d800
2602                     c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2603                 }
2604             }
2605 
2606             // now c1 and c2 are in UTF-32-compatible order
2607             return c1 - c2;
2608         }
2609     }
2610 
2611     /**
2612      * Utility for getting a code point from a CharSequence that contains exactly one code point.
2613      * @return the code point IF the string is non-null and consists of a single code point.
2614      * otherwise returns -1.
2615      * @param s to test
2616      * @stable ICU 54
2617      */
getSingleCodePoint(CharSequence s)2618     public static int getSingleCodePoint(CharSequence s) {
2619         if (s == null || s.length() == 0) {
2620             return -1;
2621         } else if (s.length() == 1) {
2622             return s.charAt(0);
2623         } else if (s.length() > 2) {
2624             return -1;
2625         }
2626 
2627         // at this point, len = 2
2628         int cp = Character.codePointAt(s, 0);
2629         if (cp > 0xFFFF) { // is surrogate pair
2630             return cp;
2631         }
2632         return -1;
2633     }
2634 
2635     /**
2636      * Utility for comparing a code point to a string without having to create a new string. Returns the same results
2637      * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if
2638      * <pre>
2639      * sc = new StringComparator(true,false,0);
2640      * fast = UTF16.compareCodePoint(codePoint, charSequence)
2641      * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
2642      * </pre>
2643      * then
2644      * <pre>
2645      * Integer.signum(fast) == Integer.signum(slower)
2646      * </pre>
2647      * @param codePoint to test
2648      * @param s to test
2649      * @return equivalent of code point comparator comparing two strings.
2650      * @stable ICU 54
2651      */
compareCodePoint(int codePoint, CharSequence s)2652     public static int compareCodePoint(int codePoint, CharSequence s) {
2653         if (s == null) {
2654             return 1;
2655         }
2656         final int strLen = s.length();
2657         if (strLen == 0) {
2658             return 1;
2659         }
2660         int second = Character.codePointAt(s, 0);
2661         int diff = codePoint - second;
2662         if (diff != 0) {
2663             return diff;
2664         }
2665         return strLen == Character.charCount(codePoint) ? 0 : -1;
2666     }
2667 
2668     // private data members -------------------------------------------------
2669 
2670     /**
2671      * Shift value for lead surrogate to form a supplementary character.
2672      */
2673     private static final int LEAD_SURROGATE_SHIFT_ = 10;
2674 
2675     /**
2676      * Mask to retrieve the significant value from a trail surrogate.
2677      */
2678     private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
2679 
2680     /**
2681      * Value that all lead surrogate starts with
2682      */
2683     private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
2684             - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
2685 
2686     // private methods ------------------------------------------------------
2687 
2688     /**
2689      * <p>
2690      * Converts argument code point and returns a String object representing the code point's value
2691      * in UTF16 format.
2692      * </p>
2693      * <p>
2694      * This method does not check for the validity of the codepoint, the results are not guaranteed
2695      * if a invalid codepoint is passed as argument.
2696      * </p>
2697      * <p>
2698      * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
2699      * </p>
2700      *
2701      * @param ch
2702      *            code point
2703      * @return string representation of the code point
2704      */
toString(int ch)2705     private static String toString(int ch) {
2706         if (ch < SUPPLEMENTARY_MIN_VALUE) {
2707             return String.valueOf((char) ch);
2708         }
2709 
2710         StringBuilder result = new StringBuilder();
2711         result.append(getLeadSurrogate(ch));
2712         result.append(getTrailSurrogate(ch));
2713         return result.toString();
2714     }
2715 }
2716 // eof
2717