• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /**
5  *******************************************************************************
6  * Copyright (C) 1996-2016, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 
11 package ohos.global.icu.text;
12 
13 import ohos.global.icu.impl.Utility;
14 
15 /**
16  * <p>
17  * Standalone utility class providing UTF16 character conversions and indexing conversions.
18  * </p>
19  * <p>
20  * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
21  * so searching for strings is a safe operation. Similarly, concatenation is always safe.
22  * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
23  * values for start and end are on those boundaries, since they arose from operations like
24  * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
25  * </p>
26  * <strong>Examples:</strong>
27  * <p>
28  * The following examples illustrate use of some of these methods.
29  *
30  * <pre>
31  * // iteration forwards: Original
32  * for (int i = 0; i &lt; s.length(); ++i) {
33  *     char ch = s.charAt(i);
34  *     doSomethingWith(ch);
35  * }
36  *
37  * // iteration forwards: Changes for UTF-32
38  * int ch;
39  * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
40  *     ch = UTF16.charAt(s, i);
41  *     doSomethingWith(ch);
42  * }
43  *
44  * // iteration backwards: Original
45  * for (int i = s.length() - 1; i &gt;= 0; --i) {
46  *     char ch = s.charAt(i);
47  *     doSomethingWith(ch);
48  * }
49  *
50  * // iteration backwards: Changes for UTF-32
51  * int ch;
52  * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
53  *     ch = UTF16.charAt(s, i);
54  *     doSomethingWith(ch);
55  * }
56  * </pre>
57  *
58  * <strong>Notes:</strong>
59  * <ul>
60  * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
61  * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
62  * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
63  * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
64  * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
65  * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
66  * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
67  * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
68  * </li>
69  * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
70  * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
71  * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
72  * check for validity if desired. </li>
73  * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
74  * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
75  * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
76  * 5.5). </li>
77  * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
78  * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
79  * percentage of all the text in the world, the singleton case should always be optimized for. </li>
80  * </ul>
81  *
82  * @author Mark Davis, with help from Markus Scherer
83  * @hide exposed on OHOS
84  */
85 
86 public final class UTF16 {
87     // public variables ---------------------------------------------------
88 
89     /**
90      * Value returned in {@link #bounds(String, int) bounds()}.
91      * These values are chosen specifically so that it actually represents the position of the
92      * character [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)]
93      */
94     public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
95             TRAIL_SURROGATE_BOUNDARY = 5;
96 
97     /**
98      * The lowest Unicode code point value.
99      */
100     public static final int CODEPOINT_MIN_VALUE = 0;
101 
102     /**
103      * The highest Unicode code point value (scalar value) according to the Unicode Standard.
104      */
105     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
106 
107     /**
108      * The minimum value for Supplementary code points
109      */
110     public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
111 
112     /**
113      * Lead surrogate minimum value
114      */
115     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
116 
117     /**
118      * Trail surrogate minimum value
119      */
120     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
121 
122     /**
123      * Lead surrogate maximum value
124      */
125     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
126 
127     /**
128      * Trail surrogate maximum value
129      */
130     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
131 
132     /**
133      * Surrogate minimum value
134      */
135     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
136 
137     /**
138      * Maximum surrogate value
139      */
140     public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
141 
142     /**
143      * Lead surrogate bitmask
144      */
145     private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
146 
147     /**
148      * Trail surrogate bitmask
149      */
150     private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
151 
152     /**
153      * Surrogate bitmask
154      */
155     private static final int SURROGATE_BITMASK = 0xFFFFF800;
156 
157     /**
158      * Lead surrogate bits
159      */
160     private static final int LEAD_SURROGATE_BITS = 0xD800;
161 
162     /**
163      * Trail surrogate bits
164      */
165     private static final int TRAIL_SURROGATE_BITS = 0xDC00;
166 
167     /**
168      * Surrogate bits
169      */
170     private static final int SURROGATE_BITS = 0xD800;
171 
172     // constructor --------------------------------------------------------
173 
174     // /CLOVER:OFF
175     /**
176      * Prevent instance from being created.
177      */
UTF16()178     private UTF16() {
179     }
180 
181     // /CLOVER:ON
182     // public method ------------------------------------------------------
183 
184     /**
185      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
186      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
187      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
188      * UCharacter.isLegal()</a></code>
189      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
190      * character will be returned. If a complete supplementary character is not found the incomplete
191      * character will be returned
192      *
193      * @param source Array of UTF-16 chars
194      * @param offset16 UTF-16 offset to the start of the character.
195      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
196      *         of that codepoint are the same as in <code>bounds32()</code>.
197      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
198      */
charAt(String source, int offset16)199     public static int charAt(String source, int offset16) {
200         char single = source.charAt(offset16);
201         if (single < LEAD_SURROGATE_MIN_VALUE) {
202             return single;
203         }
204         return _charAt(source, offset16, single);
205     }
206 
_charAt(String source, int offset16, char single)207     private static int _charAt(String source, int offset16, char single) {
208         if (single > TRAIL_SURROGATE_MAX_VALUE) {
209             return single;
210         }
211 
212         // Convert the UTF-16 surrogate pair if necessary.
213         // For simplicity in usage, and because the frequency of pairs is
214         // low, look both directions.
215 
216         if (single <= LEAD_SURROGATE_MAX_VALUE) {
217             ++offset16;
218             if (source.length() != offset16) {
219                 char trail = source.charAt(offset16);
220                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
221                     return Character.toCodePoint(single, trail);
222                 }
223             }
224         } else {
225             --offset16;
226             if (offset16 >= 0) {
227                 // single is a trail surrogate so
228                 char lead = source.charAt(offset16);
229                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
230                     return Character.toCodePoint(lead, single);
231                 }
232             }
233         }
234         return single; // return unmatched surrogate
235     }
236 
237     /**
238      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
239      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
240      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
241      * UCharacter.isLegal()</a></code>
242      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
243      * character will be returned. If a complete supplementary character is not found the incomplete
244      * character will be returned
245      *
246      * @param source Array of UTF-16 chars
247      * @param offset16 UTF-16 offset to the start of the character.
248      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
249      *         of that codepoint are the same as in <code>bounds32()</code>.
250      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
251      */
charAt(CharSequence source, int offset16)252     public static int charAt(CharSequence source, int offset16) {
253         char single = source.charAt(offset16);
254         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
255             return single;
256         }
257         return _charAt(source, offset16, single);
258     }
259 
_charAt(CharSequence source, int offset16, char single)260     private static int _charAt(CharSequence source, int offset16, char single) {
261         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
262             return single;
263         }
264 
265         // Convert the UTF-16 surrogate pair if necessary.
266         // For simplicity in usage, and because the frequency of pairs is
267         // low, look both directions.
268 
269         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
270             ++offset16;
271             if (source.length() != offset16) {
272                 char trail = source.charAt(offset16);
273                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
274                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
275                     return Character.toCodePoint(single, trail);
276                 }
277             }
278         } else {
279             --offset16;
280             if (offset16 >= 0) {
281                 // single is a trail surrogate so
282                 char lead = source.charAt(offset16);
283                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
284                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
285                     return Character.toCodePoint(lead, single);
286                 }
287             }
288         }
289         return single; // return unmatched surrogate
290     }
291 
292     /**
293      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
294      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
295      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
296      * </a></code>
297      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
298      * character will be returned. If a complete supplementary character is not found the incomplete
299      * character will be returned
300      *
301      * @param source UTF-16 chars string buffer
302      * @param offset16 UTF-16 offset to the start of the character.
303      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
304      *         of that codepoint are the same as in <code>bounds32()</code>.
305      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
306      */
charAt(StringBuffer source, int offset16)307     public static int charAt(StringBuffer source, int offset16) {
308         if (offset16 < 0 || offset16 >= source.length()) {
309             throw new StringIndexOutOfBoundsException(offset16);
310         }
311 
312         char single = source.charAt(offset16);
313         if (!isSurrogate(single)) {
314             return single;
315         }
316 
317         // Convert the UTF-16 surrogate pair if necessary.
318         // For simplicity in usage, and because the frequency of pairs is
319         // low, look both directions.
320 
321         if (single <= LEAD_SURROGATE_MAX_VALUE) {
322             ++offset16;
323             if (source.length() != offset16) {
324                 char trail = source.charAt(offset16);
325                 if (isTrailSurrogate(trail))
326                     return Character.toCodePoint(single, trail);
327             }
328         } else {
329             --offset16;
330             if (offset16 >= 0) {
331                 // single is a trail surrogate so
332                 char lead = source.charAt(offset16);
333                 if (isLeadSurrogate(lead)) {
334                     return Character.toCodePoint(lead, single);
335                 }
336             }
337         }
338         return single; // return unmatched surrogate
339     }
340 
341     /**
342      * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
343      * (with <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
344      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
345      * </a></code>
346      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
347      * character will be returned. If a complete supplementary character is not found the incomplete
348      * character will be returned
349      *
350      * @param source Array of UTF-16 chars
351      * @param start Offset to substring in the source array for analyzing
352      * @param limit Offset to substring in the source array for analyzing
353      * @param offset16 UTF-16 offset relative to start
354      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
355      *         of that codepoint are the same as in <code>bounds32()</code>.
356      * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
357      */
charAt(char source[], int start, int limit, int offset16)358     public static int charAt(char source[], int start, int limit, int offset16) {
359         offset16 += start;
360         if (offset16 < start || offset16 >= limit) {
361             throw new ArrayIndexOutOfBoundsException(offset16);
362         }
363 
364         char single = source[offset16];
365         if (!isSurrogate(single)) {
366             return single;
367         }
368 
369         // Convert the UTF-16 surrogate pair if necessary.
370         // For simplicity in usage, and because the frequency of pairs is
371         // low, look both directions.
372         if (single <= LEAD_SURROGATE_MAX_VALUE) {
373             offset16++;
374             if (offset16 >= limit) {
375                 return single;
376             }
377             char trail = source[offset16];
378             if (isTrailSurrogate(trail)) {
379                 return Character.toCodePoint(single, trail);
380             }
381         } else { // isTrailSurrogate(single), so
382             if (offset16 == start) {
383                 return single;
384             }
385             offset16--;
386             char lead = source[offset16];
387             if (isLeadSurrogate(lead))
388                 return Character.toCodePoint(lead, single);
389         }
390         return single; // return unmatched surrogate
391     }
392 
393     /**
394      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
395      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
396      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
397      * </a></code>
398      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
399      * character will be returned. If a complete supplementary character is not found the incomplete
400      * character will be returned
401      *
402      * @param source UTF-16 chars string buffer
403      * @param offset16 UTF-16 offset to the start of the character.
404      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
405      *         of that codepoint are the same as in <code>bounds32()</code>.
406      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
407      */
charAt(Replaceable source, int offset16)408     public static int charAt(Replaceable source, int offset16) {
409         if (offset16 < 0 || offset16 >= source.length()) {
410             throw new StringIndexOutOfBoundsException(offset16);
411         }
412 
413         char single = source.charAt(offset16);
414         if (!isSurrogate(single)) {
415             return single;
416         }
417 
418         // Convert the UTF-16 surrogate pair if necessary.
419         // For simplicity in usage, and because the frequency of pairs is
420         // low, look both directions.
421 
422         if (single <= LEAD_SURROGATE_MAX_VALUE) {
423             ++offset16;
424             if (source.length() != offset16) {
425                 char trail = source.charAt(offset16);
426                 if (isTrailSurrogate(trail))
427                     return Character.toCodePoint(single, trail);
428             }
429         } else {
430             --offset16;
431             if (offset16 >= 0) {
432                 // single is a trail surrogate so
433                 char lead = source.charAt(offset16);
434                 if (isLeadSurrogate(lead)) {
435                     return Character.toCodePoint(lead, single);
436                 }
437             }
438         }
439         return single; // return unmatched surrogate
440     }
441 
442     /**
443      * Determines how many chars this char32 requires. If a validity check is required, use <code>
444      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
445      * on char32 before calling.
446      *
447      * @param char32 The input codepoint.
448      * @return 2 if is in supplementary space, otherwise 1.
449      */
getCharCount(int char32)450     public static int getCharCount(int char32) {
451         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
452             return 1;
453         }
454         return 2;
455     }
456 
457     /**
458      * Returns the type of the boundaries around the char at offset16. Used for random access.
459      *
460      * @param source Text to analyse
461      * @param offset16 UTF-16 offset
462      * @return
463      *            <ul>
464      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
465      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
466      *            are [offset16, offset16 + 2]
467      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
468      *            bounds are [offset16 - 1, offset16 + 1]
469      *            </ul>
470      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
471      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
472      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
473      */
bounds(String source, int offset16)474     public static int bounds(String source, int offset16) {
475         char ch = source.charAt(offset16);
476         if (isSurrogate(ch)) {
477             if (isLeadSurrogate(ch)) {
478                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
479                     return LEAD_SURROGATE_BOUNDARY;
480                 }
481             } else {
482                 // isTrailSurrogate(ch), so
483                 --offset16;
484                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
485                     return TRAIL_SURROGATE_BOUNDARY;
486                 }
487             }
488         }
489         return SINGLE_CHAR_BOUNDARY;
490     }
491 
492     /**
493      * Returns the type of the boundaries around the char at offset16. Used for random access.
494      *
495      * @param source String buffer to analyse
496      * @param offset16 UTF16 offset
497      * @return
498      *            <ul>
499      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
500      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
501      *            are [offset16, offset16 + 2]
502      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
503      *            bounds are [offset16 - 1, offset16 + 1]
504      *            </ul>
505      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
506      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
507      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
508      */
bounds(StringBuffer source, int offset16)509     public static int bounds(StringBuffer source, int offset16) {
510         char ch = source.charAt(offset16);
511         if (isSurrogate(ch)) {
512             if (isLeadSurrogate(ch)) {
513                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
514                     return LEAD_SURROGATE_BOUNDARY;
515                 }
516             } else {
517                 // isTrailSurrogate(ch), so
518                 --offset16;
519                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
520                     return TRAIL_SURROGATE_BOUNDARY;
521                 }
522             }
523         }
524         return SINGLE_CHAR_BOUNDARY;
525     }
526 
527     /**
528      * Returns the type of the boundaries around the char at offset16. Used for random access. Note
529      * that the boundaries are determined with respect to the subarray, hence the char array
530      * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
531      *
532      * @param source Char array to analyse
533      * @param start Offset to substring in the source array for analyzing
534      * @param limit Offset to substring in the source array for analyzing
535      * @param offset16 UTF16 offset relative to start
536      * @return
537      *            <ul>
538      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
539      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
540      *            are [offset16, offset16 + 2]
541      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
542      *            bounds are [offset16 - 1, offset16 + 1]
543      *            </ul>
544      *            For bit-twiddlers, the boundary values for these are chosen so that the boundaries
545      *            can be gotten by: [offset16 - (boundvalue &gt;&gt; 2), offset16 + (boundvalue &amp; 3)].
546      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
547      */
bounds(char source[], int start, int limit, int offset16)548     public static int bounds(char source[], int start, int limit, int offset16) {
549         offset16 += start;
550         if (offset16 < start || offset16 >= limit) {
551             throw new ArrayIndexOutOfBoundsException(offset16);
552         }
553         char ch = source[offset16];
554         if (isSurrogate(ch)) {
555             if (isLeadSurrogate(ch)) {
556                 ++offset16;
557                 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
558                     return LEAD_SURROGATE_BOUNDARY;
559                 }
560             } else { // isTrailSurrogate(ch), so
561                 --offset16;
562                 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
563                     return TRAIL_SURROGATE_BOUNDARY;
564                 }
565             }
566         }
567         return SINGLE_CHAR_BOUNDARY;
568     }
569 
570     /**
571      * Determines whether the code value is a surrogate.
572      *
573      * @param char16 The input character.
574      * @return true If the input character is a surrogate.
575      */
isSurrogate(char char16)576     public static boolean isSurrogate(char char16) {
577         return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
578     }
579 
580     /**
581      * Determines whether the character is a trail surrogate.
582      *
583      * @param char16 The input character.
584      * @return true If the input character is a trail surrogate.
585      */
isTrailSurrogate(char char16)586     public static boolean isTrailSurrogate(char char16) {
587         return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
588     }
589 
590     /**
591      * Determines whether the character is a lead surrogate.
592      *
593      * @param char16 The input character.
594      * @return true If the input character is a lead surrogate
595      */
isLeadSurrogate(char char16)596     public static boolean isLeadSurrogate(char char16) {
597         return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
598     }
599 
600     /**
601      * Returns the lead surrogate. If a validity check is required, use
602      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
603      * before calling.
604      *
605      * @param char32 The input character.
606      * @return lead surrogate if the getCharCount(ch) is 2; <br>
607      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
608      */
getLeadSurrogate(int char32)609     public static char getLeadSurrogate(int char32) {
610         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
611             return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
612         }
613         return 0;
614     }
615 
616     /**
617      * Returns the trail surrogate. If a validity check is required, use
618      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
619      * before calling.
620      *
621      * @param char32 The input character.
622      * @return the trail surrogate if the getCharCount(ch) is 2; <br>
623      *         otherwise the character itself
624      */
getTrailSurrogate(int char32)625     public static char getTrailSurrogate(int char32) {
626         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
627             return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
628         }
629         return (char) char32;
630     }
631 
632     /**
633      * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
634      * containing the UTF-32 value in UTF16 format. If a validity check is required, use
635      * {@link ohos.global.icu.lang.UCharacter#isLegal(int)} on char32 before calling.
636      *
637      * @param char32 The input character.
638      * @return string value of char32 in UTF16 format
639      * @exception IllegalArgumentException Thrown if char32 is an invalid codepoint.
640      */
valueOf(int char32)641     public static String valueOf(int char32) {
642         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
643             throw new IllegalArgumentException("Illegal codepoint");
644         }
645         return toString(char32);
646     }
647 
648     /**
649      * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
650      * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
651      * character, the whole supplementary codepoint will be returned. If a validity check is
652      * required, use {@link ohos.global.icu.lang.UCharacter#isLegal(int)} on the
653      * codepoint at offset16 before calling. The result returned will be a newly created String
654      * obtained by calling source.substring(..) with the appropriate indexes.
655      *
656      * @param source The input string.
657      * @param offset16 The UTF16 index to the codepoint in source
658      * @return string value of char32 in UTF16 format
659      */
valueOf(String source, int offset16)660     public static String valueOf(String source, int offset16) {
661         switch (bounds(source, offset16)) {
662         case LEAD_SURROGATE_BOUNDARY:
663             return source.substring(offset16, offset16 + 2);
664         case TRAIL_SURROGATE_BOUNDARY:
665             return source.substring(offset16 - 1, offset16 + 1);
666         default:
667             return source.substring(offset16, offset16 + 1);
668         }
669     }
670 
671     /**
672      * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
673      * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
674      * surrogate character, the whole supplementary codepoint will be returned. If a validity check
675      * is required, use {@link ohos.global.icu.lang.UCharacter#isLegal(int)} on
676      * the codepoint at offset16 before calling. The result returned will be a newly created String
677      * obtained by calling source.substring(..) with the appropriate indexes.
678      *
679      * @param source The input string buffer.
680      * @param offset16 The UTF16 index to the codepoint in source
681      * @return string value of char32 in UTF16 format
682      */
valueOf(StringBuffer source, int offset16)683     public static String valueOf(StringBuffer source, int offset16) {
684         switch (bounds(source, offset16)) {
685         case LEAD_SURROGATE_BOUNDARY:
686             return source.substring(offset16, offset16 + 2);
687         case TRAIL_SURROGATE_BOUNDARY:
688             return source.substring(offset16 - 1, offset16 + 1);
689         default:
690             return source.substring(offset16, offset16 + 1);
691         }
692     }
693 
694     /**
695      * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
696      * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
697      * returned, except when either the leading or trailing surrogate character lies out of the
698      * specified subarray. In the latter case, only the surrogate character within bounds will be
699      * returned. If a validity check is required, use
700      * {@link ohos.global.icu.lang.UCharacter#isLegal(int)} on the codepoint at
701      * offset16 before calling. The result returned will be a newly created String containing the
702      * relevant characters.
703      *
704      * @param source The input char array.
705      * @param start Start index of the subarray
706      * @param limit End index of the subarray
707      * @param offset16 The UTF16 index to the codepoint in source relative to start
708      * @return string value of char32 in UTF16 format
709      */
valueOf(char source[], int start, int limit, int offset16)710     public static String valueOf(char source[], int start, int limit, int offset16) {
711         switch (bounds(source, start, limit, offset16)) {
712         case LEAD_SURROGATE_BOUNDARY:
713             return new String(source, start + offset16, 2);
714         case TRAIL_SURROGATE_BOUNDARY:
715             return new String(source, start + offset16 - 1, 2);
716         }
717         return new String(source, start + offset16, 1);
718     }
719 
720     /**
721      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
722      * the {@link UTF16 class description} for notes on roundtripping.
723      *
724      * @param source The UTF-16 string
725      * @param offset32 UTF-32 offset
726      * @return UTF-16 offset
727      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
728      */
findOffsetFromCodePoint(String source, int offset32)729     public static int findOffsetFromCodePoint(String source, int offset32) {
730         char ch;
731         int size = source.length(), result = 0, count = offset32;
732         if (offset32 < 0 || offset32 > size) {
733             throw new StringIndexOutOfBoundsException(offset32);
734         }
735         while (result < size && count > 0) {
736             ch = source.charAt(result);
737             if (isLeadSurrogate(ch) && ((result + 1) < size)
738                     && isTrailSurrogate(source.charAt(result + 1))) {
739                 result++;
740             }
741 
742             count--;
743             result++;
744         }
745         if (count != 0) {
746             throw new StringIndexOutOfBoundsException(offset32);
747         }
748         return result;
749     }
750 
751     /**
752      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
753      * the {@link UTF16 class description} for notes on roundtripping.
754      *
755      * @param source The UTF-16 string buffer
756      * @param offset32 UTF-32 offset
757      * @return UTF-16 offset
758      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
759      */
findOffsetFromCodePoint(StringBuffer source, int offset32)760     public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
761         char ch;
762         int size = source.length(), result = 0, count = offset32;
763         if (offset32 < 0 || offset32 > size) {
764             throw new StringIndexOutOfBoundsException(offset32);
765         }
766         while (result < size && count > 0) {
767             ch = source.charAt(result);
768             if (isLeadSurrogate(ch) && ((result + 1) < size)
769                     && isTrailSurrogate(source.charAt(result + 1))) {
770                 result++;
771             }
772 
773             count--;
774             result++;
775         }
776         if (count != 0) {
777             throw new StringIndexOutOfBoundsException(offset32);
778         }
779         return result;
780     }
781 
782     /**
783      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
784      * the {@link UTF16 class description} for notes on roundtripping.
785      *
786      * @param source The UTF-16 char array whose substring is to be analysed
787      * @param start Offset of the substring to be analysed
788      * @param limit Offset of the substring to be analysed
789      * @param offset32 UTF-32 offset relative to start
790      * @return UTF-16 offset relative to start
791      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
792      */
findOffsetFromCodePoint(char source[], int start, int limit, int offset32)793     public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
794         char ch;
795         int result = start, count = offset32;
796         if (offset32 > limit - start) {
797             throw new ArrayIndexOutOfBoundsException(offset32);
798         }
799         while (result < limit && count > 0) {
800             ch = source[result];
801             if (isLeadSurrogate(ch) && ((result + 1) < limit)
802                     && isTrailSurrogate(source[result + 1])) {
803                 result++;
804             }
805 
806             count--;
807             result++;
808         }
809         if (count != 0) {
810             throw new ArrayIndexOutOfBoundsException(offset32);
811         }
812         return result - start;
813     }
814 
815     /**
816      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
817      * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for
818      * notes on roundtripping.<br>
819      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
820      * of the <strong>lead</strong> of the pair is returned. </i>
821      * <p>
822      * To find the UTF-32 length of a string, use:
823      *
824      * <pre>
825      * len32 = countCodePoint(source, source.length());
826      * </pre>
827      *
828      * @param source Text to analyse
829      * @param offset16 UTF-16 offset &lt; source text length.
830      * @return UTF-32 offset
831      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
832      */
findCodePointOffset(String source, int offset16)833     public static int findCodePointOffset(String source, int offset16) {
834         if (offset16 < 0 || offset16 > source.length()) {
835             throw new StringIndexOutOfBoundsException(offset16);
836         }
837 
838         int result = 0;
839         char ch;
840         boolean hadLeadSurrogate = false;
841 
842         for (int i = 0; i < offset16; ++i) {
843             ch = source.charAt(i);
844             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
845                 hadLeadSurrogate = false; // count valid trail as zero
846             } else {
847                 hadLeadSurrogate = isLeadSurrogate(ch);
848                 ++result; // count others as 1
849             }
850         }
851 
852         if (offset16 == source.length()) {
853             return result;
854         }
855 
856         // end of source being the less significant surrogate character
857         // shift result back to the start of the supplementary character
858         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
859             result--;
860         }
861 
862         return result;
863     }
864 
865     /**
866      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
867      * offset. Used for random access. See the {@link UTF16 class description} for notes on
868      * roundtripping.<br>
869      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
870      * of the <strong>lead</strong> of the pair is returned. </i>
871      * <p>
872      * To find the UTF-32 length of a string, use:
873      *
874      * <pre>
875      * len32 = countCodePoint(source);
876      * </pre>
877      *
878      * @param source Text to analyse
879      * @param offset16 UTF-16 offset &lt; source text length.
880      * @return UTF-32 offset
881      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
882      */
findCodePointOffset(StringBuffer source, int offset16)883     public static int findCodePointOffset(StringBuffer source, int offset16) {
884         if (offset16 < 0 || offset16 > source.length()) {
885             throw new StringIndexOutOfBoundsException(offset16);
886         }
887 
888         int result = 0;
889         char ch;
890         boolean hadLeadSurrogate = false;
891 
892         for (int i = 0; i < offset16; ++i) {
893             ch = source.charAt(i);
894             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
895                 hadLeadSurrogate = false; // count valid trail as zero
896             } else {
897                 hadLeadSurrogate = isLeadSurrogate(ch);
898                 ++result; // count others as 1
899             }
900         }
901 
902         if (offset16 == source.length()) {
903             return result;
904         }
905 
906         // end of source being the less significant surrogate character
907         // shift result back to the start of the supplementary character
908         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
909             result--;
910         }
911 
912         return result;
913     }
914 
915     /**
916      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
917      * offset. Used for random access. See the {@link UTF16 class description} for notes on
918      * roundtripping.<br>
919      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
920      * of the <strong>lead</strong> of the pair is returned. </i>
921      * <p>
922      * To find the UTF-32 length of a substring, use:
923      *
924      * <pre>
925      * len32 = countCodePoint(source, start, limit);
926      * </pre>
927      *
928      * @param source Text to analyse
929      * @param start Offset of the substring
930      * @param limit Offset of the substring
931      * @param offset16 UTF-16 relative to start
932      * @return UTF-32 offset relative to start
933      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
934      */
findCodePointOffset(char source[], int start, int limit, int offset16)935     public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
936         offset16 += start;
937         if (offset16 > limit) {
938             throw new StringIndexOutOfBoundsException(offset16);
939         }
940 
941         int result = 0;
942         char ch;
943         boolean hadLeadSurrogate = false;
944 
945         for (int i = start; i < offset16; ++i) {
946             ch = source[i];
947             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
948                 hadLeadSurrogate = false; // count valid trail as zero
949             } else {
950                 hadLeadSurrogate = isLeadSurrogate(ch);
951                 ++result; // count others as 1
952             }
953         }
954 
955         if (offset16 == limit) {
956             return result;
957         }
958 
959         // end of source being the less significant surrogate character
960         // shift result back to the start of the supplementary character
961         if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
962             result--;
963         }
964 
965         return result;
966     }
967 
968     /**
969      * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
970      * use {@link ohos.global.icu.lang.UCharacter#isLegal(int)} on char32 before
971      * calling.
972      *
973      * @param target The buffer to append to
974      * @param char32 Value to append.
975      * @return the updated StringBuffer
976      * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints
977      */
append(StringBuffer target, int char32)978     public static StringBuffer append(StringBuffer target, int char32) {
979         // Check for irregular values
980         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
981             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
982         }
983 
984         // Write the UTF-16 values
985         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
986             target.append(getLeadSurrogate(char32));
987             target.append(getTrailSurrogate(char32));
988         } else {
989             target.append((char) char32);
990         }
991         return target;
992     }
993 
994     /**
995      * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
996      * convenience.
997      *
998      * @param target The buffer to append to
999      * @param cp The code point to append
1000      * @return the updated StringBuffer
1001      * @throws IllegalArgumentException If cp is not a valid code point
1002      */
appendCodePoint(StringBuffer target, int cp)1003     public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
1004         return append(target, cp);
1005     }
1006 
1007     /**
1008      * Adds a codepoint to offset16 position of the argument char array.
1009      *
1010      * @param target Char array to be append with the new code point
1011      * @param limit UTF16 offset which the codepoint will be appended.
1012      * @param char32 Code point to be appended
1013      * @return offset after char32 in the array.
1014      * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not
1015      *                lie within the range of the Unicode codepoints.
1016      */
append(char[] target, int limit, int char32)1017     public static int append(char[] target, int limit, int char32) {
1018         // Check for irregular values
1019         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1020             throw new IllegalArgumentException("Illegal codepoint");
1021         }
1022         // Write the UTF-16 values
1023         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1024             target[limit++] = getLeadSurrogate(char32);
1025             target[limit++] = getTrailSurrogate(char32);
1026         } else {
1027             target[limit++] = (char) char32;
1028         }
1029         return limit;
1030     }
1031 
1032     /**
1033      * Number of codepoints in a UTF16 String
1034      *
1035      * @param source UTF16 string
1036      * @return number of codepoint in string
1037      */
countCodePoint(String source)1038     public static int countCodePoint(String source) {
1039         if (source == null || source.length() == 0) {
1040             return 0;
1041         }
1042         return findCodePointOffset(source, source.length());
1043     }
1044 
1045     /**
1046      * Number of codepoints in a UTF16 String buffer
1047      *
1048      * @param source UTF16 string buffer
1049      * @return number of codepoint in string
1050      */
countCodePoint(StringBuffer source)1051     public static int countCodePoint(StringBuffer source) {
1052         if (source == null || source.length() == 0) {
1053             return 0;
1054         }
1055         return findCodePointOffset(source, source.length());
1056     }
1057 
1058     /**
1059      * Number of codepoints in a UTF16 char array substring
1060      *
1061      * @param source UTF16 char array
1062      * @param start Offset of the substring
1063      * @param limit Offset of the substring
1064      * @return number of codepoint in the substring
1065      * @exception IndexOutOfBoundsException If start and limit are not valid.
1066      */
countCodePoint(char source[], int start, int limit)1067     public static int countCodePoint(char source[], int start, int limit) {
1068         if (source == null || source.length == 0) {
1069             return 0;
1070         }
1071         return findCodePointOffset(source, start, limit, limit - start);
1072     }
1073 
1074     /**
1075      * Set a code point into a UTF16 position. Adjusts target according if we are replacing a
1076      * non-supplementary codepoint with a supplementary and vice versa.
1077      *
1078      * @param target Stringbuffer
1079      * @param offset16 UTF16 position to insert into
1080      * @param char32 Code point
1081      */
setCharAt(StringBuffer target, int offset16, int char32)1082     public static void setCharAt(StringBuffer target, int offset16, int char32) {
1083         int count = 1;
1084         char single = target.charAt(offset16);
1085 
1086         if (isSurrogate(single)) {
1087             // pairs of the surrogate with offset16 at the lead char found
1088             if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
1089                     && isTrailSurrogate(target.charAt(offset16 + 1))) {
1090                 count++;
1091             } else {
1092                 // pairs of the surrogate with offset16 at the trail char
1093                 // found
1094                 if (isTrailSurrogate(single) && (offset16 > 0)
1095                         && isLeadSurrogate(target.charAt(offset16 - 1))) {
1096                     offset16--;
1097                     count++;
1098                 }
1099             }
1100         }
1101         target.replace(offset16, offset16 + count, valueOf(char32));
1102     }
1103 
1104     /**
1105      * Set a code point into a UTF16 position in a char array. Adjusts target according if we are
1106      * replacing a non-supplementary codepoint with a supplementary and vice versa.
1107      *
1108      * @param target char array
1109      * @param limit numbers of valid chars in target, different from target.length. limit counts the
1110      *            number of chars in target that represents a string, not the size of array target.
1111      * @param offset16 UTF16 position to insert into
1112      * @param char32 code point
1113      * @return new number of chars in target that represents a string
1114      * @exception IndexOutOfBoundsException if offset16 is out of range
1115      */
setCharAt(char target[], int limit, int offset16, int char32)1116     public static int setCharAt(char target[], int limit, int offset16, int char32) {
1117         if (offset16 >= limit) {
1118             throw new ArrayIndexOutOfBoundsException(offset16);
1119         }
1120         int count = 1;
1121         char single = target[offset16];
1122 
1123         if (isSurrogate(single)) {
1124             // pairs of the surrogate with offset16 at the lead char found
1125             if (isLeadSurrogate(single) && (target.length > offset16 + 1)
1126                     && isTrailSurrogate(target[offset16 + 1])) {
1127                 count++;
1128             } else {
1129                 // pairs of the surrogate with offset16 at the trail char
1130                 // found
1131                 if (isTrailSurrogate(single) && (offset16 > 0)
1132                         && isLeadSurrogate(target[offset16 - 1])) {
1133                     offset16--;
1134                     count++;
1135                 }
1136             }
1137         }
1138 
1139         String str = valueOf(char32);
1140         int result = limit;
1141         int strlength = str.length();
1142         target[offset16] = str.charAt(0);
1143         if (count == strlength) {
1144             if (count == 2) {
1145                 target[offset16 + 1] = str.charAt(1);
1146             }
1147         } else {
1148             // this is not exact match in space, we'll have to do some
1149             // shifting
1150             System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
1151                     - (offset16 + count));
1152             if (count < strlength) {
1153                 // char32 is a supplementary character trying to squeeze into
1154                 // a non-supplementary space
1155                 target[offset16 + 1] = str.charAt(1);
1156                 result++;
1157                 if (result < target.length) {
1158                     target[result] = 0;
1159                 }
1160             } else {
1161                 // char32 is a non-supplementary character trying to fill
1162                 // into a supplementary space
1163                 result--;
1164                 target[result] = 0;
1165             }
1166         }
1167         return result;
1168     }
1169 
1170     /**
1171      * Shifts offset16 by the argument number of codepoints
1172      *
1173      * @param source string
1174      * @param offset16 UTF16 position to shift
1175      * @param shift32 number of codepoints to shift
1176      * @return new shifted offset16
1177      * @exception IndexOutOfBoundsException if the new offset16 is out of bounds.
1178      */
moveCodePointOffset(String source, int offset16, int shift32)1179     public static int moveCodePointOffset(String source, int offset16, int shift32) {
1180         int result = offset16;
1181         int size = source.length();
1182         int count;
1183         char ch;
1184         if (offset16 < 0 || offset16 > size) {
1185             throw new StringIndexOutOfBoundsException(offset16);
1186         }
1187         if (shift32 > 0) {
1188             if (shift32 + offset16 > size) {
1189                 throw new StringIndexOutOfBoundsException(offset16);
1190             }
1191             count = shift32;
1192             while (result < size && count > 0) {
1193                 ch = source.charAt(result);
1194                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1195                         && isTrailSurrogate(source.charAt(result + 1))) {
1196                     result++;
1197                 }
1198                 count--;
1199                 result++;
1200             }
1201         } else {
1202             if (offset16 + shift32 < 0) {
1203                 throw new StringIndexOutOfBoundsException(offset16);
1204             }
1205             for (count = -shift32; count > 0; count--) {
1206                 result--;
1207                 if (result < 0) {
1208                     break;
1209                 }
1210                 ch = source.charAt(result);
1211                 if (isTrailSurrogate(ch) && result > 0
1212                         && isLeadSurrogate(source.charAt(result - 1))) {
1213                     result--;
1214                 }
1215             }
1216         }
1217         if (count != 0) {
1218             throw new StringIndexOutOfBoundsException(shift32);
1219         }
1220         return result;
1221     }
1222 
1223     /**
1224      * Shifts offset16 by the argument number of codepoints
1225      *
1226      * @param source String buffer
1227      * @param offset16 UTF16 position to shift
1228      * @param shift32 Number of codepoints to shift
1229      * @return new shifted offset16
1230      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds.
1231      */
moveCodePointOffset(StringBuffer source, int offset16, int shift32)1232     public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
1233         int result = offset16;
1234         int size = source.length();
1235         int count;
1236         char ch;
1237         if (offset16 < 0 || offset16 > size) {
1238             throw new StringIndexOutOfBoundsException(offset16);
1239         }
1240         if (shift32 > 0) {
1241             if (shift32 + offset16 > size) {
1242                 throw new StringIndexOutOfBoundsException(offset16);
1243             }
1244             count = shift32;
1245             while (result < size && count > 0) {
1246                 ch = source.charAt(result);
1247                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1248                         && isTrailSurrogate(source.charAt(result + 1))) {
1249                     result++;
1250                 }
1251                 count--;
1252                 result++;
1253             }
1254         } else {
1255             if (offset16 + shift32 < 0) {
1256                 throw new StringIndexOutOfBoundsException(offset16);
1257             }
1258             for (count = -shift32; count > 0; count--) {
1259                 result--;
1260                 if (result < 0) {
1261                     break;
1262                 }
1263                 ch = source.charAt(result);
1264                 if (isTrailSurrogate(ch) && result > 0
1265                         && isLeadSurrogate(source.charAt(result - 1))) {
1266                     result--;
1267                 }
1268             }
1269         }
1270         if (count != 0) {
1271             throw new StringIndexOutOfBoundsException(shift32);
1272         }
1273         return result;
1274     }
1275 
1276     /**
1277      * Shifts offset16 by the argument number of codepoints within a subarray.
1278      *
1279      * @param source Char array
1280      * @param start Position of the subarray to be performed on
1281      * @param limit Position of the subarray to be performed on
1282      * @param offset16 UTF16 position to shift relative to start
1283      * @param shift32 Number of codepoints to shift
1284      * @return new shifted offset16 relative to start
1285      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the
1286      *                subarray bounds are out of range.
1287      */
moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32)1288     public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
1289             int shift32) {
1290         int size = source.length;
1291         int count;
1292         char ch;
1293         int result = offset16 + start;
1294         if (start < 0 || limit < start) {
1295             throw new StringIndexOutOfBoundsException(start);
1296         }
1297         if (limit > size) {
1298             throw new StringIndexOutOfBoundsException(limit);
1299         }
1300         if (offset16 < 0 || result > limit) {
1301             throw new StringIndexOutOfBoundsException(offset16);
1302         }
1303         if (shift32 > 0) {
1304             if (shift32 + result > size) {
1305                 throw new StringIndexOutOfBoundsException(result);
1306             }
1307             count = shift32;
1308             while (result < limit && count > 0) {
1309                 ch = source[result];
1310                 if (isLeadSurrogate(ch) && (result + 1 < limit)
1311                         && isTrailSurrogate(source[result + 1])) {
1312                     result++;
1313                 }
1314                 count--;
1315                 result++;
1316             }
1317         } else {
1318             if (result + shift32 < start) {
1319                 throw new StringIndexOutOfBoundsException(result);
1320             }
1321             for (count = -shift32; count > 0; count--) {
1322                 result--;
1323                 if (result < start) {
1324                     break;
1325                 }
1326                 ch = source[result];
1327                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
1328                     result--;
1329                 }
1330             }
1331         }
1332         if (count != 0) {
1333             throw new StringIndexOutOfBoundsException(shift32);
1334         }
1335         result -= start;
1336         return result;
1337     }
1338 
1339     /**
1340      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1341      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1342      * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
1343      * otherwise.
1344      * <p>
1345      * The overall effect is exactly as if the argument were converted to a string by the method
1346      * valueOf(char) and the characters in that string were then inserted into target at the
1347      * position indicated by offset16.
1348      * </p>
1349      * <p>
1350      * The offset argument must be greater than or equal to 0, and less than or equal to the length
1351      * of source.
1352      *
1353      * @param target String buffer to insert to
1354      * @param offset16 Offset which char32 will be inserted in
1355      * @param char32 Codepoint to be inserted
1356      * @return a reference to target
1357      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1358      */
insert(StringBuffer target, int offset16, int char32)1359     public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
1360         String str = valueOf(char32);
1361         if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1362             offset16++;
1363         }
1364         target.insert(offset16, str);
1365         return target;
1366     }
1367 
1368     /**
1369      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1370      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1371      * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1372      * <p>
1373      * The overall effect is exactly as if the argument were converted to a string by the method
1374      * valueOf(char) and the characters in that string were then inserted into target at the
1375      * position indicated by offset16.
1376      * </p>
1377      * <p>
1378      * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
1379      *
1380      * @param target Char array to insert to
1381      * @param limit End index of the char array, limit &lt;= target.length
1382      * @param offset16 Offset which char32 will be inserted in
1383      * @param char32 Codepoint to be inserted
1384      * @return new limit size
1385      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1386      */
insert(char target[], int limit, int offset16, int char32)1387     public static int insert(char target[], int limit, int offset16, int char32) {
1388         String str = valueOf(char32);
1389         if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1390             offset16++;
1391         }
1392         int size = str.length();
1393         if (limit + size > target.length) {
1394             throw new ArrayIndexOutOfBoundsException(offset16 + size);
1395         }
1396         System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
1397         target[offset16] = str.charAt(0);
1398         if (size == 2) {
1399             target[offset16 + 1] = str.charAt(1);
1400         }
1401         return limit + size;
1402     }
1403 
1404     /**
1405      * Removes the codepoint at the specified position in this target (shortening target by 1
1406      * character if the codepoint is a non-supplementary, 2 otherwise).
1407      *
1408      * @param target String buffer to remove codepoint from
1409      * @param offset16 Offset which the codepoint will be removed
1410      * @return a reference to target
1411      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1412      */
delete(StringBuffer target, int offset16)1413     public static StringBuffer delete(StringBuffer target, int offset16) {
1414         int count = 1;
1415         switch (bounds(target, offset16)) {
1416         case LEAD_SURROGATE_BOUNDARY:
1417             count++;
1418             break;
1419         case TRAIL_SURROGATE_BOUNDARY:
1420             count++;
1421             offset16--;
1422             break;
1423         }
1424         target.delete(offset16, offset16 + count);
1425         return target;
1426     }
1427 
1428     /**
1429      * Removes the codepoint at the specified position in this target (shortening target by 1
1430      * character if the codepoint is a non-supplementary, 2 otherwise).
1431      *
1432      * @param target String buffer to remove codepoint from
1433      * @param limit End index of the char array, limit &lt;= target.length
1434      * @param offset16 Offset which the codepoint will be removed
1435      * @return a new limit size
1436      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1437      */
delete(char target[], int limit, int offset16)1438     public static int delete(char target[], int limit, int offset16) {
1439         int count = 1;
1440         switch (bounds(target, 0, limit, offset16)) {
1441         case LEAD_SURROGATE_BOUNDARY:
1442             count++;
1443             break;
1444         case TRAIL_SURROGATE_BOUNDARY:
1445             count++;
1446             offset16--;
1447             break;
1448         }
1449         System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
1450         target[limit - count] = 0;
1451         return limit - count;
1452     }
1453 
1454     /**
1455      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1456      * the argument codepoint. I.e., the smallest index <code>i</code> such that
1457      * <code>UTF16.charAt(source, i) ==
1458      * char32</code> is true.
1459      * <p>
1460      * If no such character occurs in this string, then -1 is returned.
1461      * </p>
1462      * <p>
1463      * Examples:<br>
1464      * UTF16.indexOf("abc", 'a') returns 0<br>
1465      * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1466      * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1467      * </p>
1468      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1469      * characters to its fullest.
1470      *
1471      * @param source UTF16 format Unicode string that will be searched
1472      * @param char32 Codepoint to search for
1473      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1474      *         -1 if the codepoint does not occur.
1475      */
indexOf(String source, int char32)1476     public static int indexOf(String source, int char32) {
1477         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1478             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1479         }
1480         // non-surrogate bmp
1481         if (char32 < LEAD_SURROGATE_MIN_VALUE
1482                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1483             return source.indexOf((char) char32);
1484         }
1485         // surrogate
1486         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1487             int result = source.indexOf((char) char32);
1488             if (result >= 0) {
1489                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1490                         && isTrailSurrogate(source.charAt(result + 1))) {
1491                     return indexOf(source, char32, result + 1);
1492                 }
1493                 // trail surrogate
1494                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1495                     return indexOf(source, char32, result + 1);
1496                 }
1497             }
1498             return result;
1499         }
1500         // supplementary
1501         String char32str = toString(char32);
1502         return source.indexOf(char32str);
1503     }
1504 
1505     /**
1506      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1507      * the argument string str. This method is implemented based on codepoints, hence a "lead
1508      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1509      * starts with trail surrogate character at index 0, a source with a leading a surrogate
1510      * character before str found at in source will not have a valid match. Vice versa for lead
1511      * surrogates that ends str. See example below.
1512      * <p>
1513      * If no such string str occurs in this source, then -1 is returned.
1514      * </p>
1515      * <p>
1516      * Examples:<br>
1517      * UTF16.indexOf("abc", "ab") returns 0<br>
1518      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1519      * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1520      * </p>
1521      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1522      * characters to its fullest.
1523      *
1524      * @param source UTF16 format Unicode string that will be searched
1525      * @param str UTF16 format Unicode string to search for
1526      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1527      *         -1 if the codepoint does not occur.
1528      */
indexOf(String source, String str)1529     public static int indexOf(String source, String str) {
1530         int strLength = str.length();
1531         // non-surrogate ends
1532         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1533             return source.indexOf(str);
1534         }
1535 
1536         int result = source.indexOf(str);
1537         int resultEnd = result + strLength;
1538         if (result >= 0) {
1539             // check last character
1540             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1541                     && isTrailSurrogate(source.charAt(resultEnd + 1))) {
1542                 return indexOf(source, str, resultEnd + 1);
1543             }
1544             // check first character which is a trail surrogate
1545             if (isTrailSurrogate(str.charAt(0)) && result > 0
1546                     && isLeadSurrogate(source.charAt(result - 1))) {
1547                 return indexOf(source, str, resultEnd + 1);
1548             }
1549         }
1550         return result;
1551     }
1552 
1553     /**
1554      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1555      * the argument codepoint. I.e., the smallest index i such that: <br>
1556      * (UTF16.charAt(source, i) == char32 &amp;&amp; i &gt;= fromIndex) is true.
1557      * <p>
1558      * If no such character occurs in this string, then -1 is returned.
1559      * </p>
1560      * <p>
1561      * Examples:<br>
1562      * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1563      * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
1564      * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
1565      * </p>
1566      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1567      * characters to its fullest.
1568      *
1569      * @param source UTF16 format Unicode string that will be searched
1570      * @param char32 Codepoint to search for
1571      * @param fromIndex The index to start the search from.
1572      * @return the index of the first occurrence of the codepoint in the argument Unicode string at
1573      *         or after fromIndex, or -1 if the codepoint does not occur.
1574      */
indexOf(String source, int char32, int fromIndex)1575     public static int indexOf(String source, int char32, int fromIndex) {
1576         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1577             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1578         }
1579         // non-surrogate bmp
1580         if (char32 < LEAD_SURROGATE_MIN_VALUE
1581                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1582             return source.indexOf((char) char32, fromIndex);
1583         }
1584         // surrogate
1585         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1586             int result = source.indexOf((char) char32, fromIndex);
1587             if (result >= 0) {
1588                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1589                         && isTrailSurrogate(source.charAt(result + 1))) {
1590                     return indexOf(source, char32, result + 1);
1591                 }
1592                 // trail surrogate
1593                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1594                     return indexOf(source, char32, result + 1);
1595                 }
1596             }
1597             return result;
1598         }
1599         // supplementary
1600         String char32str = toString(char32);
1601         return source.indexOf(char32str, fromIndex);
1602     }
1603 
1604     /**
1605      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1606      * the argument string str. This method is implemented based on codepoints, hence a "lead
1607      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1608      * starts with trail surrogate character at index 0, a source with a leading a surrogate
1609      * character before str found at in source will not have a valid match. Vice versa for lead
1610      * surrogates that ends str. See example below.
1611      * <p>
1612      * If no such string str occurs in this source, then -1 is returned.
1613      * </p>
1614      * <p>
1615      * Examples:<br>
1616      * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1617      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
1618      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
1619      * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
1620      * </p>
1621      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1622      * characters to its fullest.
1623      *
1624      * @param source UTF16 format Unicode string that will be searched
1625      * @param str UTF16 format Unicode string to search for
1626      * @param fromIndex The index to start the search from.
1627      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1628      *         -1 if the codepoint does not occur.
1629      */
indexOf(String source, String str, int fromIndex)1630     public static int indexOf(String source, String str, int fromIndex) {
1631         int strLength = str.length();
1632         // non-surrogate ends
1633         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1634             return source.indexOf(str, fromIndex);
1635         }
1636 
1637         int result = source.indexOf(str, fromIndex);
1638         int resultEnd = result + strLength;
1639         if (result >= 0) {
1640             // check last character
1641             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1642                     && isTrailSurrogate(source.charAt(resultEnd))) {
1643                 return indexOf(source, str, resultEnd + 1);
1644             }
1645             // check first character which is a trail surrogate
1646             if (isTrailSurrogate(str.charAt(0)) && result > 0
1647                     && isLeadSurrogate(source.charAt(result - 1))) {
1648                 return indexOf(source, str, resultEnd + 1);
1649             }
1650         }
1651         return result;
1652     }
1653 
1654     /**
1655      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1656      * the argument codepoint. I.e., the index returned is the largest value i such that:
1657      * UTF16.charAt(source, i) == char32 is true.
1658      * <p>
1659      * Examples:<br>
1660      * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1661      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1662      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1663      * </p>
1664      * <p>
1665      * source is searched backwards starting at the last character.
1666      * </p>
1667      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1668      * characters to its fullest.
1669      *
1670      * @param source UTF16 format Unicode string that will be searched
1671      * @param char32 Codepoint to search for
1672      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1673      *         does not occur.
1674      */
lastIndexOf(String source, int char32)1675     public static int lastIndexOf(String source, int char32) {
1676         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1677             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1678         }
1679         // non-surrogate bmp
1680         if (char32 < LEAD_SURROGATE_MIN_VALUE
1681                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1682             return source.lastIndexOf((char) char32);
1683         }
1684         // surrogate
1685         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1686             int result = source.lastIndexOf((char) char32);
1687             if (result >= 0) {
1688                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1689                         && isTrailSurrogate(source.charAt(result + 1))) {
1690                     return lastIndexOf(source, char32, result - 1);
1691                 }
1692                 // trail surrogate
1693                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1694                     return lastIndexOf(source, char32, result - 1);
1695                 }
1696             }
1697             return result;
1698         }
1699         // supplementary
1700         String char32str = toString(char32);
1701         return source.lastIndexOf(char32str);
1702     }
1703 
1704     /**
1705      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1706      * the argument string str. This method is implemented based on codepoints, hence a "lead
1707      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1708      * starts with trail surrogate character at index 0, a source with a leading a surrogate
1709      * character before str found at in source will not have a valid match. Vice versa for lead
1710      * surrogates that ends str. See example below.
1711      * <p>
1712      * Examples:<br>
1713      * UTF16.lastIndexOf("abc", "a") returns 0<br>
1714      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1715      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1716      * </p>
1717      * <p>
1718      * source is searched backwards starting at the last character.
1719      * </p>
1720      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1721      * characters to its fullest.
1722      *
1723      * @param source UTF16 format Unicode string that will be searched
1724      * @param str UTF16 format Unicode string to search for
1725      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1726      *         does not occur.
1727      */
lastIndexOf(String source, String str)1728     public static int lastIndexOf(String source, String str) {
1729         int strLength = str.length();
1730         // non-surrogate ends
1731         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1732             return source.lastIndexOf(str);
1733         }
1734 
1735         int result = source.lastIndexOf(str);
1736         if (result >= 0) {
1737             // check last character
1738             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1739                     && isTrailSurrogate(source.charAt(result + strLength + 1))) {
1740                 return lastIndexOf(source, str, result - 1);
1741             }
1742             // check first character which is a trail surrogate
1743             if (isTrailSurrogate(str.charAt(0)) && result > 0
1744                     && isLeadSurrogate(source.charAt(result - 1))) {
1745                 return lastIndexOf(source, str, result - 1);
1746             }
1747         }
1748         return result;
1749     }
1750 
1751     /**
1752      * <p>
1753      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1754      * the argument codepoint, where the result is less than or equals to fromIndex.
1755      * </p>
1756      * <p>
1757      * This method is implemented based on codepoints, hence a single surrogate character will not
1758      * match a supplementary character.
1759      * </p>
1760      * <p>
1761      * source is searched backwards starting at the last character starting at the specified index.
1762      * </p>
1763      * <p>
1764      * Examples:<br>
1765      * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1766      * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1767      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
1768      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
1769      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1770      * </p>
1771      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1772      * characters to its fullest.
1773      *
1774      * @param source UTF16 format Unicode string that will be searched
1775      * @param char32 Codepoint to search for
1776      * @param fromIndex the index to start the search from. There is no restriction on the value of
1777      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1778      *            same effect as if it were equal to one less than the length of this string: this
1779      *            entire string may be searched. If it is negative, it has the same effect as if it
1780      *            were -1: -1 is returned.
1781      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1782      *         does not occur.
1783      */
lastIndexOf(String source, int char32, int fromIndex)1784     public static int lastIndexOf(String source, int char32, int fromIndex) {
1785         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1786             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1787         }
1788         // non-surrogate bmp
1789         if (char32 < LEAD_SURROGATE_MIN_VALUE
1790                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1791             return source.lastIndexOf((char) char32, fromIndex);
1792         }
1793         // surrogate
1794         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1795             int result = source.lastIndexOf((char) char32, fromIndex);
1796             if (result >= 0) {
1797                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1798                         && isTrailSurrogate(source.charAt(result + 1))) {
1799                     return lastIndexOf(source, char32, result - 1);
1800                 }
1801                 // trail surrogate
1802                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1803                     return lastIndexOf(source, char32, result - 1);
1804                 }
1805             }
1806             return result;
1807         }
1808         // supplementary
1809         String char32str = toString(char32);
1810         return source.lastIndexOf(char32str, fromIndex);
1811     }
1812 
1813     /**
1814      * <p>
1815      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1816      * the argument string str, where the result is less than or equals to fromIndex.
1817      * </p>
1818      * <p>
1819      * This method is implemented based on codepoints, hence a "lead surrogate character + trail
1820      * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate
1821      * character at index 0, a source with a leading a surrogate character before str found at in
1822      * source will not have a valid match. Vice versa for lead surrogates that ends str.
1823      * </p>
1824      * See example below.
1825      * <p>
1826      * Examples:<br>
1827      * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
1828      * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
1829      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
1830      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
1831      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
1832      * </p>
1833      * <p>
1834      * source is searched backwards starting at the last character.
1835      * </p>
1836      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1837      * characters to its fullest.
1838      *
1839      * @param source UTF16 format Unicode string that will be searched
1840      * @param str UTF16 format Unicode string to search for
1841      * @param fromIndex the index to start the search from. There is no restriction on the value of
1842      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1843      *            same effect as if it were equal to one less than the length of this string: this
1844      *            entire string may be searched. If it is negative, it has the same effect as if it
1845      *            were -1: -1 is returned.
1846      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1847      *         does not occur.
1848      */
lastIndexOf(String source, String str, int fromIndex)1849     public static int lastIndexOf(String source, String str, int fromIndex) {
1850         int strLength = str.length();
1851         // non-surrogate ends
1852         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1853             return source.lastIndexOf(str, fromIndex);
1854         }
1855 
1856         int result = source.lastIndexOf(str, fromIndex);
1857         if (result >= 0) {
1858             // check last character
1859             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1860                     && isTrailSurrogate(source.charAt(result + strLength))) {
1861                 return lastIndexOf(source, str, result - 1);
1862             }
1863             // check first character which is a trail surrogate
1864             if (isTrailSurrogate(str.charAt(0)) && result > 0
1865                     && isLeadSurrogate(source.charAt(result - 1))) {
1866                 return lastIndexOf(source, str, result - 1);
1867             }
1868         }
1869         return result;
1870     }
1871 
1872     /**
1873      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
1874      * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
1875      * format Unicode string source, then source will be returned. Otherwise, a new String object is
1876      * created that represents a codepoint sequence identical to the codepoint sequence represented
1877      * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
1878      * newChar32.
1879      * <p>
1880      * Examples: <br>
1881      * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
1882      * returns "mosquito in your collar"<br>
1883      * UTF16.replace("JonL", 'q', 'x');<br>
1884      * returns "JonL" (no change)<br>
1885      * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
1886      * returns "Supplementary character !"<br>
1887      * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
1888      * returns "Supplementary character \ud800\udc00"<br>
1889      * </p>
1890      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1891      * characters to its fullest.
1892      *
1893      * @param source UTF16 format Unicode string which the codepoint replacements will be based on.
1894      * @param oldChar32 Non-zero old codepoint to be replaced.
1895      * @param newChar32 The new codepoint to replace oldChar32
1896      * @return new String derived from source by replacing every occurrence of oldChar32 with
1897      *         newChar32, unless when no oldChar32 is found in source then source will be returned.
1898      */
replace(String source, int oldChar32, int newChar32)1899     public static String replace(String source, int oldChar32, int newChar32) {
1900         if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
1901             throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
1902         }
1903         if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
1904             throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
1905         }
1906 
1907         int index = indexOf(source, oldChar32);
1908         if (index == -1) {
1909             return source;
1910         }
1911         String newChar32Str = toString(newChar32);
1912         int oldChar32Size = 1;
1913         int newChar32Size = newChar32Str.length();
1914         StringBuffer result = new StringBuffer(source);
1915         int resultIndex = index;
1916 
1917         if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
1918             oldChar32Size = 2;
1919         }
1920 
1921         while (index != -1) {
1922             int endResultIndex = resultIndex + oldChar32Size;
1923             result.replace(resultIndex, endResultIndex, newChar32Str);
1924             int lastEndIndex = index + oldChar32Size;
1925             index = indexOf(source, oldChar32, lastEndIndex);
1926             resultIndex += newChar32Size + index - lastEndIndex;
1927         }
1928         return result.toString();
1929     }
1930 
1931     /**
1932      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
1933      * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
1934      * source, then source will be returned. Otherwise, a new String object is created that
1935      * represents a codepoint sequence identical to the codepoint sequence represented by source,
1936      * except that every occurrence of oldStr is replaced by an occurrence of newStr.
1937      * <p>
1938      * Examples: <br>
1939      * UTF16.replace("mesquite in your cellar", "e", "o");<br>
1940      * returns "mosquito in your collar"<br>
1941      * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
1942      * returns "cat in your cellar"<br>
1943      * UTF16.replace("JonL", "q", "x");<br>
1944      * returns "JonL" (no change)<br>
1945      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br>
1946      * returns "Supplementary character !"<br>
1947      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br>
1948      * returns "Supplementary character \ud800\udc00"<br>
1949      * </p>
1950      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1951      * characters to its fullest.
1952      *
1953      * @param source UTF16 format Unicode string which the replacements will be based on.
1954      * @param oldStr Non-zero-length string to be replaced.
1955      * @param newStr The new string to replace oldStr
1956      * @return new String derived from source by replacing every occurrence of oldStr with newStr.
1957      *         When no oldStr is found in source, then source will be returned.
1958      */
replace(String source, String oldStr, String newStr)1959     public static String replace(String source, String oldStr, String newStr) {
1960         int index = indexOf(source, oldStr);
1961         if (index == -1) {
1962             return source;
1963         }
1964         int oldStrSize = oldStr.length();
1965         int newStrSize = newStr.length();
1966         StringBuffer result = new StringBuffer(source);
1967         int resultIndex = index;
1968 
1969         while (index != -1) {
1970             int endResultIndex = resultIndex + oldStrSize;
1971             result.replace(resultIndex, endResultIndex, newStr);
1972             int lastEndIndex = index + oldStrSize;
1973             index = indexOf(source, oldStr, lastEndIndex);
1974             resultIndex += newStrSize + index - lastEndIndex;
1975         }
1976         return result.toString();
1977     }
1978 
1979     /**
1980      * Reverses a UTF16 format Unicode string and replaces source's content with it. This method
1981      * will reverse surrogate characters correctly, instead of blindly reversing every character.
1982      * <p>
1983      * Examples:<br>
1984      * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
1985      * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
1986      *
1987      * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed
1988      * @return a modified source with reversed UTF16 format Unicode string.
1989      */
reverse(StringBuffer source)1990     public static StringBuffer reverse(StringBuffer source) {
1991         int length = source.length();
1992         StringBuffer result = new StringBuffer(length);
1993         for (int i = length; i-- > 0;) {
1994             char ch = source.charAt(i);
1995             if (isTrailSurrogate(ch) && i > 0) {
1996                 char ch2 = source.charAt(i - 1);
1997                 if (isLeadSurrogate(ch2)) {
1998                     result.append(ch2);
1999                     result.append(ch);
2000                     --i;
2001                     continue;
2002                 }
2003             }
2004             result.append(ch);
2005         }
2006         return result;
2007     }
2008 
2009     /**
2010      * Check if the string contains more Unicode code points than a certain number. This is more
2011      * efficient than counting all code points in the entire string and comparing that number with a
2012      * threshold. This function may not need to scan the string at all if the length is within a
2013      * certain range, and never needs to count more than 'number + 1' code points. Logically
2014      * equivalent to (countCodePoint(s) &gt; number). A Unicode code point may occupy either one or two
2015      * code units.
2016      *
2017      * @param source The input string.
2018      * @param number The number of code points in the string is compared against the 'number'
2019      *            parameter.
2020      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2021      */
hasMoreCodePointsThan(String source, int number)2022     public static boolean hasMoreCodePointsThan(String source, int number) {
2023         if (number < 0) {
2024             return true;
2025         }
2026         if (source == null) {
2027             return false;
2028         }
2029         int length = source.length();
2030 
2031         // length >= 0 known
2032         // source contains at least (length + 1) / 2 code points: <= 2
2033         // chars per cp
2034         if (((length + 1) >> 1) > number) {
2035             return true;
2036         }
2037 
2038         // check if source does not even contain enough chars
2039         int maxsupplementary = length - number;
2040         if (maxsupplementary <= 0) {
2041             return false;
2042         }
2043 
2044         // there are maxsupplementary = length - number more chars than
2045         // asked-for code points
2046 
2047         // count code points until they exceed and also check that there are
2048         // no more than maxsupplementary supplementary code points (char pairs)
2049         int start = 0;
2050         while (true) {
2051             if (length == 0) {
2052                 return false;
2053             }
2054             if (number == 0) {
2055                 return true;
2056             }
2057             if (isLeadSurrogate(source.charAt(start++)) && start != length
2058                     && isTrailSurrogate(source.charAt(start))) {
2059                 start++;
2060                 if (--maxsupplementary <= 0) {
2061                     // too many pairs - too few code points
2062                     return false;
2063                 }
2064             }
2065             --number;
2066         }
2067     }
2068 
2069     /**
2070      * Check if the sub-range of char array, from argument start to limit, contains more Unicode
2071      * code points than a certain number. This is more efficient than counting all code points in
2072      * the entire char array range and comparing that number with a threshold. This function may not
2073      * need to scan the char array at all if start and limit is within a certain range, and never
2074      * needs to count more than 'number + 1' code points. Logically equivalent to
2075      * (countCodePoint(source, start, limit) &gt; number). A Unicode code point may occupy either one
2076      * or two code units.
2077      *
2078      * @param source Array of UTF-16 chars
2079      * @param start Offset to substring in the source array for analyzing
2080      * @param limit Offset to substring in the source array for analyzing
2081      * @param number The number of code points in the string is compared against the 'number'
2082      *            parameter.
2083      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2084      * @exception IndexOutOfBoundsException Thrown when limit &lt; start
2085      */
hasMoreCodePointsThan(char source[], int start, int limit, int number)2086     public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
2087         int length = limit - start;
2088         if (length < 0 || start < 0 || limit < 0) {
2089             throw new IndexOutOfBoundsException(
2090                     "Start and limit indexes should be non-negative and start <= limit");
2091         }
2092         if (number < 0) {
2093             return true;
2094         }
2095         if (source == null) {
2096             return false;
2097         }
2098 
2099         // length >= 0 known
2100         // source contains at least (length + 1) / 2 code points: <= 2
2101         // chars per cp
2102         if (((length + 1) >> 1) > number) {
2103             return true;
2104         }
2105 
2106         // check if source does not even contain enough chars
2107         int maxsupplementary = length - number;
2108         if (maxsupplementary <= 0) {
2109             return false;
2110         }
2111 
2112         // there are maxsupplementary = length - number more chars than
2113         // asked-for code points
2114 
2115         // count code points until they exceed and also check that there are
2116         // no more than maxsupplementary supplementary code points (char pairs)
2117         while (true) {
2118             if (length == 0) {
2119                 return false;
2120             }
2121             if (number == 0) {
2122                 return true;
2123             }
2124             if (isLeadSurrogate(source[start++]) && start != limit
2125                     && isTrailSurrogate(source[start])) {
2126                 start++;
2127                 if (--maxsupplementary <= 0) {
2128                     // too many pairs - too few code points
2129                     return false;
2130                 }
2131             }
2132             --number;
2133         }
2134     }
2135 
2136     /**
2137      * Check if the string buffer contains more Unicode code points than a certain number. This is
2138      * more efficient than counting all code points in the entire string buffer and comparing that
2139      * number with a threshold. This function may not need to scan the string buffer at all if the
2140      * length is within a certain range, and never needs to count more than 'number + 1' code
2141      * points. Logically equivalent to (countCodePoint(s) &gt; number). A Unicode code point may
2142      * occupy either one or two code units.
2143      *
2144      * @param source The input string buffer.
2145      * @param number The number of code points in the string buffer is compared against the 'number'
2146      *            parameter.
2147      * @return boolean value for whether the string buffer contains more Unicode code points than
2148      *         'number'.
2149      */
hasMoreCodePointsThan(StringBuffer source, int number)2150     public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
2151         if (number < 0) {
2152             return true;
2153         }
2154         if (source == null) {
2155             return false;
2156         }
2157         int length = source.length();
2158 
2159         // length >= 0 known
2160         // source contains at least (length + 1) / 2 code points: <= 2
2161         // chars per cp
2162         if (((length + 1) >> 1) > number) {
2163             return true;
2164         }
2165 
2166         // check if source does not even contain enough chars
2167         int maxsupplementary = length - number;
2168         if (maxsupplementary <= 0) {
2169             return false;
2170         }
2171 
2172         // there are maxsupplementary = length - number more chars than
2173         // asked-for code points
2174 
2175         // count code points until they exceed and also check that there are
2176         // no more than maxsupplementary supplementary code points (char pairs)
2177         int start = 0;
2178         while (true) {
2179             if (length == 0) {
2180                 return false;
2181             }
2182             if (number == 0) {
2183                 return true;
2184             }
2185             if (isLeadSurrogate(source.charAt(start++)) && start != length
2186                     && isTrailSurrogate(source.charAt(start))) {
2187                 start++;
2188                 if (--maxsupplementary <= 0) {
2189                     // too many pairs - too few code points
2190                     return false;
2191                 }
2192             }
2193             --number;
2194         }
2195     }
2196 
2197     /**
2198      * Cover JDK 1.5 API. Create a String from an array of codePoints.
2199      *
2200      * @param codePoints The code array
2201      * @param offset The start of the text in the code point array
2202      * @param count The number of code points
2203      * @return a String representing the code points between offset and count
2204      * @throws IllegalArgumentException If an invalid code point is encountered
2205      * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
2206      */
newString(int[] codePoints, int offset, int count)2207     public static String newString(int[] codePoints, int offset, int count) {
2208         if (count < 0) {
2209             throw new IllegalArgumentException();
2210         }
2211         char[] chars = new char[count];
2212         int w = 0;
2213         for (int r = offset, e = offset + count; r < e; ++r) {
2214             int cp = codePoints[r];
2215             if (cp < 0 || cp > 0x10ffff) {
2216                 throw new IllegalArgumentException();
2217             }
2218             while (true) {
2219                 try {
2220                     if (cp < 0x010000) {
2221                         chars[w] = (char) cp;
2222                         w++;
2223                     } else {
2224                         chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
2225                         chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
2226                         w += 2;
2227                     }
2228                     break;
2229                 } catch (IndexOutOfBoundsException ex) {
2230                     int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
2231                             / (r - offset + 1)));
2232                     char[] temp = new char[newlen];
2233                     System.arraycopy(chars, 0, temp, 0, w);
2234                     chars = temp;
2235                 }
2236             }
2237         }
2238         return new String(chars, 0, w);
2239     }
2240 
2241     /**
2242      * <p>
2243      * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
2244      * modes
2245      * </p>
2246      * <ul>
2247      * <li> Code point comparison or code unit comparison
2248      * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
2249      * with special handling for character 'i'.
2250      * </ul>
2251      * <p>
2252      * The code unit or code point comparison differ only when comparing supplementary code points
2253      * (&#92;u10000..&#92;u10ffff) to BMP code points near the end of the BMP (i.e.,
2254      * &#92;ue000..&#92;uffff). In code unit comparison, high BMP code points sort after
2255      * supplementary code points because they are stored as pairs of surrogates which are at
2256      * &#92;ud800..&#92;udfff.
2257      * </p>
2258      *
2259      * @see #FOLD_CASE_DEFAULT
2260      * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2261      * @hide exposed on OHOS
2262      */
2263     public static final class StringComparator implements java.util.Comparator<String> {
2264         // public constructor ------------------------------------------------
2265 
2266         /**
2267          * Default constructor that does code unit comparison and case sensitive comparison.
2268          */
StringComparator()2269         public StringComparator() {
2270             this(false, false, FOLD_CASE_DEFAULT);
2271         }
2272 
2273         /**
2274          * Constructor that does comparison based on the argument options.
2275          *
2276          * @param codepointcompare Flag to indicate true for code point comparison or false for code unit
2277          *            comparison.
2278          * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison
2279          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2280          *            when ignorecase is set to true. If ignorecase is false, this option is
2281          *            ignored.
2282          * @see #FOLD_CASE_DEFAULT
2283          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2284          * @throws IllegalArgumentException If foldcaseoption is out of range
2285          */
StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption)2286         public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
2287             setCodePointCompare(codepointcompare);
2288             m_ignoreCase_ = ignorecase;
2289             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2290                 throw new IllegalArgumentException("Invalid fold case option");
2291             }
2292             m_foldCase_ = foldcaseoption;
2293         }
2294 
2295         // public data member ------------------------------------------------
2296 
2297         /**
2298          * Option value for case folding comparison:
2299          *
2300          * <p>Comparison is case insensitive, strings are folded using default mappings defined in
2301          * Unicode data file CaseFolding.txt, before comparison.
2302          */
2303         public static final int FOLD_CASE_DEFAULT = 0;
2304 
2305         /**
2306          * Option value for case folding:
2307          * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
2308          * and dotless i appropriately for Turkic languages (tr, az).
2309          *
2310          * <p>Comparison is case insensitive, strings are folded using modified mappings defined in
2311          * Unicode data file CaseFolding.txt, before comparison.
2312          *
2313          * @see ohos.global.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
2314          */
2315         public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2316 
2317         // public methods ----------------------------------------------------
2318 
2319         // public setters ----------------------------------------------------
2320 
2321         /**
2322          * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
2323          * is set to code unit compare
2324          *
2325          * @param flag True for code point compare, false for code unit compare
2326          */
setCodePointCompare(boolean flag)2327         public void setCodePointCompare(boolean flag) {
2328             if (flag) {
2329                 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2330             } else {
2331                 m_codePointCompare_ = 0;
2332             }
2333         }
2334 
2335         /**
2336          * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
2337          * case sensitive comparison mode if set to false.
2338          *
2339          * @param ignorecase True for case-insitive comparison, false for case sensitive comparison
2340          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2341          *            when ignorecase is set to true. If ignorecase is false, this option is
2342          *            ignored.
2343          * @see #FOLD_CASE_DEFAULT
2344          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2345          */
setIgnoreCase(boolean ignorecase, int foldcaseoption)2346         public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
2347             m_ignoreCase_ = ignorecase;
2348             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2349                 throw new IllegalArgumentException("Invalid fold case option");
2350             }
2351             m_foldCase_ = foldcaseoption;
2352         }
2353 
2354         // public getters ----------------------------------------------------
2355 
2356         /**
2357          * Checks if the comparison mode is code point compare.
2358          *
2359          * @return true for code point compare, false for code unit compare
2360          */
getCodePointCompare()2361         public boolean getCodePointCompare() {
2362             return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2363         }
2364 
2365         /**
2366          * Checks if Comparator is in the case insensitive mode.
2367          *
2368          * @return true if Comparator performs case insensitive comparison, false otherwise
2369          */
getIgnoreCase()2370         public boolean getIgnoreCase() {
2371             return m_ignoreCase_;
2372         }
2373 
2374         /**
2375          * Gets the fold case options set in Comparator to be used with case insensitive comparison.
2376          *
2377          * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2378          * @see #FOLD_CASE_DEFAULT
2379          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2380          */
getIgnoreCaseOption()2381         public int getIgnoreCaseOption() {
2382             return m_foldCase_;
2383         }
2384 
2385         // public other methods ----------------------------------------------
2386 
2387         /**
2388          * Compare two strings depending on the options selected during construction.
2389          *
2390          * @param a first source string.
2391          * @param b second source string.
2392          * @return 0 returned if a == b. If a &lt; b, a negative value is returned. Otherwise if a &gt; b,
2393          *         a positive value is returned.
2394          * @exception ClassCastException thrown when either a or b is not a String object
2395          */
2396         @Override
compare(String a, String b)2397         public int compare(String a, String b) {
2398             if (Utility.sameObjects(a, b)) {
2399                 return 0;
2400             }
2401             if (a == null) {
2402                 return -1;
2403             }
2404             if (b == null) {
2405                 return 1;
2406             }
2407 
2408             if (m_ignoreCase_) {
2409                 return compareCaseInsensitive(a, b);
2410             }
2411             return compareCaseSensitive(a, b);
2412         }
2413 
2414         // private data member ----------------------------------------------
2415 
2416         /**
2417          * Code unit comparison flag. True if code unit comparison is required. False if code point
2418          * comparison is required.
2419          */
2420         private int m_codePointCompare_;
2421 
2422         /**
2423          * Fold case comparison option.
2424          */
2425         private int m_foldCase_;
2426 
2427         /**
2428          * Flag indicator if ignore case is to be used during comparison
2429          */
2430         private boolean m_ignoreCase_;
2431 
2432         /**
2433          * Code point order offset for surrogate characters
2434          */
2435         private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2436 
2437         // private method ---------------------------------------------------
2438 
2439         /**
2440          * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
2441          * easier.
2442          *
2443          * @param s1
2444          *            first string to compare
2445          * @param s2
2446          *            second string to compare
2447          * @return -1 is s1 &lt; s2, 0 if equals,
2448          */
compareCaseInsensitive(String s1, String s2)2449         private int compareCaseInsensitive(String s1, String s2) {
2450             return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
2451                     | Normalizer.COMPARE_IGNORE_CASE);
2452         }
2453 
2454         /**
2455          * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
2456          * easier.
2457          *
2458          * @param s1
2459          *            first string to compare
2460          * @param s2
2461          *            second string to compare
2462          * @return -1 is s1 &lt; s2, 0 if equals,
2463          */
compareCaseSensitive(String s1, String s2)2464         private int compareCaseSensitive(String s1, String s2) {
2465             // compare identical prefixes - they do not need to be fixed up
2466             // limit1 = start1 + min(lenght1, length2)
2467             int length1 = s1.length();
2468             int length2 = s2.length();
2469             int minlength = length1;
2470             int result = 0;
2471             if (length1 < length2) {
2472                 result = -1;
2473             } else if (length1 > length2) {
2474                 result = 1;
2475                 minlength = length2;
2476             }
2477 
2478             char c1 = 0;
2479             char c2 = 0;
2480             int index = 0;
2481             for (; index < minlength; index++) {
2482                 c1 = s1.charAt(index);
2483                 c2 = s2.charAt(index);
2484                 // check pseudo-limit
2485                 if (c1 != c2) {
2486                     break;
2487                 }
2488             }
2489 
2490             if (index == minlength) {
2491                 return result;
2492             }
2493 
2494             boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2495             // if both values are in or above the surrogate range, fix them up
2496             if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
2497                     && codepointcompare) {
2498                 // subtract 0x2800 from BMP code points to make them smaller
2499                 // than supplementary ones
2500                 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
2501                         || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
2502                     // part of a surrogate pair, leave >=d800
2503                 } else {
2504                     // BMP code point - may be surrogate code point - make
2505                     // < d800
2506                     c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2507                 }
2508 
2509                 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
2510                         || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
2511                     // part of a surrogate pair, leave >=d800
2512                 } else {
2513                     // BMP code point - may be surrogate code point - make <d800
2514                     c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2515                 }
2516             }
2517 
2518             // now c1 and c2 are in UTF-32-compatible order
2519             return c1 - c2;
2520         }
2521     }
2522 
2523     /**
2524      * Utility for getting a code point from a CharSequence that contains exactly one code point.
2525      * @return the code point IF the string is non-null and consists of a single code point.
2526      * otherwise returns -1.
2527      * @param s to test
2528      */
getSingleCodePoint(CharSequence s)2529     public static int getSingleCodePoint(CharSequence s) {
2530         if (s == null || s.length() == 0) {
2531             return -1;
2532         } else if (s.length() == 1) {
2533             return s.charAt(0);
2534         } else if (s.length() > 2) {
2535             return -1;
2536         }
2537 
2538         // at this point, len = 2
2539         int cp = Character.codePointAt(s, 0);
2540         if (cp > 0xFFFF) { // is surrogate pair
2541             return cp;
2542         }
2543         return -1;
2544     }
2545 
2546     /**
2547      * Utility for comparing a code point to a string without having to create a new string. Returns the same results
2548      * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if
2549      * <pre>
2550      * sc = new StringComparator(true,false,0);
2551      * fast = UTF16.compareCodePoint(codePoint, charSequence)
2552      * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
2553      * </pre>
2554      * then
2555      * <pre>
2556      * Integer.signum(fast) == Integer.signum(slower)
2557      * </pre>
2558      * @param codePoint to test
2559      * @param s to test
2560      * @return equivalent of code point comparator comparing two strings.
2561      */
compareCodePoint(int codePoint, CharSequence s)2562     public static int compareCodePoint(int codePoint, CharSequence s) {
2563         if (s == null) {
2564             return 1;
2565         }
2566         final int strLen = s.length();
2567         if (strLen == 0) {
2568             return 1;
2569         }
2570         int second = Character.codePointAt(s, 0);
2571         int diff = codePoint - second;
2572         if (diff != 0) {
2573             return diff;
2574         }
2575         return strLen == Character.charCount(codePoint) ? 0 : -1;
2576     }
2577 
2578     // private data members -------------------------------------------------
2579 
2580     /**
2581      * Shift value for lead surrogate to form a supplementary character.
2582      */
2583     private static final int LEAD_SURROGATE_SHIFT_ = 10;
2584 
2585     /**
2586      * Mask to retrieve the significant value from a trail surrogate.
2587      */
2588     private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
2589 
2590     /**
2591      * Value that all lead surrogate starts with
2592      */
2593     private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
2594             - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
2595 
2596     // private methods ------------------------------------------------------
2597 
2598     /**
2599      * <p>
2600      * Converts argument code point and returns a String object representing the code point's value
2601      * in UTF16 format.
2602      * </p>
2603      * <p>
2604      * This method does not check for the validity of the codepoint, the results are not guaranteed
2605      * if a invalid codepoint is passed as argument.
2606      * </p>
2607      * <p>
2608      * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
2609      * </p>
2610      *
2611      * @param ch
2612      *            code point
2613      * @return string representation of the code point
2614      */
toString(int ch)2615     private static String toString(int ch) {
2616         if (ch < SUPPLEMENTARY_MIN_VALUE) {
2617             return String.valueOf((char) ch);
2618         }
2619 
2620         StringBuilder result = new StringBuilder();
2621         result.append(getLeadSurrogate(ch));
2622         result.append(getTrailSurrogate(ch));
2623         return result.toString();
2624     }
2625 }
2626 // eof
2627