• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 1996-2015, International Business Machines Corporation and    *
6  * others. All Rights Reserved.                                                *
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import java.io.IOException;
12 import java.util.ArrayList;
13 import java.util.Iterator;
14 import java.util.Locale;
15 import java.util.regex.Pattern;
16 
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.text.Replaceable;
19 import com.ibm.icu.text.UTF16;
20 import com.ibm.icu.text.UnicodeMatcher;
21 import com.ibm.icu.util.ICUUncheckedIOException;
22 
23 public final class Utility {
24 
    // Single-quote character; not referenced in the visible part of this file.
    private static final char APOSTROPHE = '\'';
    // Backslash character; not referenced in the visible part of this file.
    private static final char BACKSLASH  = '\\';
    // Only the top (sign) bit of a 32-bit int is set; as a signed int this is Integer.MIN_VALUE.
    // NOTE(review): presumably a bias for unsigned comparisons further down -- confirm at the use site.
    private static final int MAGIC_UNSIGNED = 0x80000000;
28 
29     /**
30      * Convenience utility to compare two Object[]s.
31      * Ought to be in System
32      */
arrayEquals(Object[] source, Object target)33     public final static boolean arrayEquals(Object[] source, Object target) {
34         if (source == null) return (target == null);
35         if (!(target instanceof Object[])) return false;
36         Object[] targ = (Object[]) target;
37         return (source.length == targ.length
38                 && arrayRegionMatches(source, 0, targ, 0, source.length));
39     }
40 
41     /**
42      * Convenience utility to compare two int[]s
43      * Ought to be in System
44      */
arrayEquals(int[] source, Object target)45     public final static boolean arrayEquals(int[] source, Object target) {
46         if (source == null) return (target == null);
47         if (!(target instanceof int[])) return false;
48         int[] targ = (int[]) target;
49         return (source.length == targ.length
50                 && arrayRegionMatches(source, 0, targ, 0, source.length));
51     }
52 
53     /**
54      * Convenience utility to compare two double[]s
55      * Ought to be in System
56      */
arrayEquals(double[] source, Object target)57     public final static boolean arrayEquals(double[] source, Object target) {
58         if (source == null) return (target == null);
59         if (!(target instanceof double[])) return false;
60         double[] targ = (double[]) target;
61         return (source.length == targ.length
62                 && arrayRegionMatches(source, 0, targ, 0, source.length));
63     }
arrayEquals(byte[] source, Object target)64     public final static boolean arrayEquals(byte[] source, Object target) {
65         if (source == null) return (target == null);
66         if (!(target instanceof byte[])) return false;
67         byte[] targ = (byte[]) target;
68         return (source.length == targ.length
69                 && arrayRegionMatches(source, 0, targ, 0, source.length));
70     }
71 
72     /**
73      * Convenience utility to compare two Object[]s
74      * Ought to be in System
75      */
arrayEquals(Object source, Object target)76     public final static boolean arrayEquals(Object source, Object target) {
77         if (source == null) return (target == null);
78         // for some reason, the correct arrayEquals is not being called
79         // so do it by hand for now.
80         if (source instanceof Object[])
81             return(arrayEquals((Object[]) source,target));
82         if (source instanceof int[])
83             return(arrayEquals((int[]) source,target));
84         if (source instanceof double[])
85             return(arrayEquals((double[]) source, target));
86         if (source instanceof byte[])
87             return(arrayEquals((byte[]) source,target));
88         return source.equals(target);
89     }
90 
91     /**
92      * Convenience utility to compare two Object[]s
93      * Ought to be in System.
94      * @param len the length to compare.
95      * The start indices and start+len must be valid.
96      */
arrayRegionMatches(Object[] source, int sourceStart, Object[] target, int targetStart, int len)97     public final static boolean arrayRegionMatches(Object[] source, int sourceStart,
98             Object[] target, int targetStart,
99             int len)
100     {
101         int sourceEnd = sourceStart + len;
102         int delta = targetStart - sourceStart;
103         for (int i = sourceStart; i < sourceEnd; i++) {
104             if (!arrayEquals(source[i],target[i + delta]))
105                 return false;
106         }
107         return true;
108     }
109 
110     /**
111      * Convenience utility to compare two Object[]s
112      * Ought to be in System.
113      * @param len the length to compare.
114      * The start indices and start+len must be valid.
115      */
arrayRegionMatches(char[] source, int sourceStart, char[] target, int targetStart, int len)116     public final static boolean arrayRegionMatches(char[] source, int sourceStart,
117             char[] target, int targetStart,
118             int len)
119     {
120         int sourceEnd = sourceStart + len;
121         int delta = targetStart - sourceStart;
122         for (int i = sourceStart; i < sourceEnd; i++) {
123             if (source[i]!=target[i + delta])
124                 return false;
125         }
126         return true;
127     }
128 
129     /**
130      * Convenience utility to compare two int[]s.
131      * @param len the length to compare.
132      * The start indices and start+len must be valid.
133      * Ought to be in System
134      */
arrayRegionMatches(int[] source, int sourceStart, int[] target, int targetStart, int len)135     public final static boolean arrayRegionMatches(int[] source, int sourceStart,
136             int[] target, int targetStart,
137             int len)
138     {
139         int sourceEnd = sourceStart + len;
140         int delta = targetStart - sourceStart;
141         for (int i = sourceStart; i < sourceEnd; i++) {
142             if (source[i] != target[i + delta])
143                 return false;
144         }
145         return true;
146     }
147 
148     /**
149      * Convenience utility to compare two arrays of doubles.
150      * @param len the length to compare.
151      * The start indices and start+len must be valid.
152      * Ought to be in System
153      */
arrayRegionMatches(double[] source, int sourceStart, double[] target, int targetStart, int len)154     public final static boolean arrayRegionMatches(double[] source, int sourceStart,
155             double[] target, int targetStart,
156             int len)
157     {
158         int sourceEnd = sourceStart + len;
159         int delta = targetStart - sourceStart;
160         for (int i = sourceStart; i < sourceEnd; i++) {
161             if (source[i] != target[i + delta])
162                 return false;
163         }
164         return true;
165     }
arrayRegionMatches(byte[] source, int sourceStart, byte[] target, int targetStart, int len)166     public final static boolean arrayRegionMatches(byte[] source, int sourceStart,
167             byte[] target, int targetStart, int len){
168         int sourceEnd = sourceStart + len;
169         int delta = targetStart - sourceStart;
170         for (int i = sourceStart; i < sourceEnd; i++) {
171             if (source[i] != target[i + delta])
172                 return false;
173         }
174         return true;
175     }
176 
177     /**
178      * Trivial reference equality.
179      * This method should help document that we really want == not equals(),
180      * and to have a single place to suppress warnings from static analysis tools.
181      */
sameObjects(Object a, Object b)182     public static final boolean sameObjects(Object a, Object b) {
183         return a == b;
184     }
185 
186     /**
187      * Convenience utility. Does null checks on objects, then calls compare.
188      */
checkCompare(T a, T b)189     public static <T extends Comparable<T>> int checkCompare(T a, T b) {
190         return a == null ?
191                 b == null ? 0 : -1 :
192                     b == null ? 1 : a.compareTo(b);
193       }
194 
195     /**
196      * Convenience utility. Does null checks on object, then calls hashCode.
197      */
checkHash(Object a)198     public static int checkHash(Object a) {
199         return a == null ? 0 : a.hashCode();
200       }
201 
    /**
     * The ESCAPE character is used during run-length encoding.  It signals
     * a run of identical chars.  The int, short and char encoders and
     * decoders in this class all use this marker; a literal occurrence of
     * the value is written doubled.
     */
    private static final char ESCAPE = '\uA5A5';

    /**
     * The ESCAPE_BYTE character is used during run-length encoding.  It signals
     * a run of identical bytes.  As a signed Java byte, 0xA5 is negative, so
     * comparisons against it are made on byte values.
     */
    static final byte ESCAPE_BYTE = (byte)0xA5;
213 
214     /**
215      * Construct a string representing an int array.  Use run-length encoding.
216      * A character represents itself, unless it is the ESCAPE character.  Then
217      * the following notations are possible:
218      *   ESCAPE ESCAPE   ESCAPE literal
219      *   ESCAPE n c      n instances of character c
220      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
221      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
222      * If we encounter a run where n == ESCAPE, we represent this as:
223      *   c ESCAPE n-1 c
224      * The ESCAPE value is chosen so as not to collide with commonly
225      * seen values.
226      */
arrayToRLEString(int[] a)227     static public final String arrayToRLEString(int[] a) {
228         StringBuilder buffer = new StringBuilder();
229 
230         appendInt(buffer, a.length);
231         int runValue = a[0];
232         int runLength = 1;
233         for (int i=1; i<a.length; ++i) {
234             int s = a[i];
235             if (s == runValue && runLength < 0xFFFF) {
236                 ++runLength;
237             } else {
238                 encodeRun(buffer, runValue, runLength);
239                 runValue = s;
240                 runLength = 1;
241             }
242         }
243         encodeRun(buffer, runValue, runLength);
244         return buffer.toString();
245     }
246 
247     /**
248      * Construct a string representing a short array.  Use run-length encoding.
249      * A character represents itself, unless it is the ESCAPE character.  Then
250      * the following notations are possible:
251      *   ESCAPE ESCAPE   ESCAPE literal
252      *   ESCAPE n c      n instances of character c
253      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
254      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
255      * If we encounter a run where n == ESCAPE, we represent this as:
256      *   c ESCAPE n-1 c
257      * The ESCAPE value is chosen so as not to collide with commonly
258      * seen values.
259      */
arrayToRLEString(short[] a)260     static public final String arrayToRLEString(short[] a) {
261         StringBuilder buffer = new StringBuilder();
262         // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]);
263         buffer.append((char) (a.length >> 16));
264         buffer.append((char) a.length);
265         short runValue = a[0];
266         int runLength = 1;
267         for (int i=1; i<a.length; ++i) {
268             short s = a[i];
269             if (s == runValue && runLength < 0xFFFF) ++runLength;
270             else {
271                 encodeRun(buffer, runValue, runLength);
272                 runValue = s;
273                 runLength = 1;
274             }
275         }
276         encodeRun(buffer, runValue, runLength);
277         return buffer.toString();
278     }
279 
280     /**
281      * Construct a string representing a char array.  Use run-length encoding.
282      * A character represents itself, unless it is the ESCAPE character.  Then
283      * the following notations are possible:
284      *   ESCAPE ESCAPE   ESCAPE literal
285      *   ESCAPE n c      n instances of character c
286      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
287      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
288      * If we encounter a run where n == ESCAPE, we represent this as:
289      *   c ESCAPE n-1 c
290      * The ESCAPE value is chosen so as not to collide with commonly
291      * seen values.
292      */
arrayToRLEString(char[] a)293     static public final String arrayToRLEString(char[] a) {
294         StringBuilder buffer = new StringBuilder();
295         buffer.append((char) (a.length >> 16));
296         buffer.append((char) a.length);
297         char runValue = a[0];
298         int runLength = 1;
299         for (int i=1; i<a.length; ++i) {
300             char s = a[i];
301             if (s == runValue && runLength < 0xFFFF) ++runLength;
302             else {
303                 encodeRun(buffer, (short)runValue, runLength);
304                 runValue = s;
305                 runLength = 1;
306             }
307         }
308         encodeRun(buffer, (short)runValue, runLength);
309         return buffer.toString();
310     }
311 
312     /**
313      * Construct a string representing a byte array.  Use run-length encoding.
314      * Two bytes are packed into a single char, with a single extra zero byte at
315      * the end if needed.  A byte represents itself, unless it is the
316      * ESCAPE_BYTE.  Then the following notations are possible:
317      *   ESCAPE_BYTE ESCAPE_BYTE   ESCAPE_BYTE literal
318      *   ESCAPE_BYTE n b           n instances of byte b
319      * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
320      * more bytes.  Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
321      * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
322      *   b ESCAPE_BYTE n-1 b
323      * The ESCAPE_BYTE value is chosen so as not to collide with commonly
324      * seen values.
325      */
arrayToRLEString(byte[] a)326     static public final String arrayToRLEString(byte[] a) {
327         StringBuilder buffer = new StringBuilder();
328         buffer.append((char) (a.length >> 16));
329         buffer.append((char) a.length);
330         byte runValue = a[0];
331         int runLength = 1;
332         byte[] state = new byte[2];
333         for (int i=1; i<a.length; ++i) {
334             byte b = a[i];
335             if (b == runValue && runLength < 0xFF) ++runLength;
336             else {
337                 encodeRun(buffer, runValue, runLength, state);
338                 runValue = b;
339                 runLength = 1;
340             }
341         }
342         encodeRun(buffer, runValue, runLength, state);
343 
344         // We must save the final byte, if there is one, by padding
345         // an extra zero.
346         if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state);
347 
348         return buffer.toString();
349     }
350 
351     /**
352      * Encode a run, possibly a degenerate run (of < 4 values).
353      * @param length The length of the run; must be > 0 && <= 0xFFFF.
354      */
encodeRun(T buffer, int value, int length)355     private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) {
356         if (length < 4) {
357             for (int j=0; j<length; ++j) {
358                 if (value == ESCAPE) {
359                     appendInt(buffer, value);
360                 }
361                 appendInt(buffer, value);
362             }
363         }
364         else {
365             if (length == ESCAPE) {
366                 if (value == ESCAPE) {
367                     appendInt(buffer, ESCAPE);
368                 }
369                 appendInt(buffer, value);
370                 --length;
371             }
372             appendInt(buffer, ESCAPE);
373             appendInt(buffer, length);
374             appendInt(buffer, value); // Don't need to escape this value
375         }
376     }
377 
appendInt(T buffer, int value)378     private static final <T extends Appendable> void appendInt(T buffer, int value) {
379         try {
380             buffer.append((char)(value >>> 16));
381             buffer.append((char)(value & 0xFFFF));
382         } catch (IOException e) {
383             throw new IllegalIcuArgumentException(e);
384         }
385     }
386 
387     /**
388      * Encode a run, possibly a degenerate run (of < 4 values).
389      * @param length The length of the run; must be > 0 && <= 0xFFFF.
390      */
encodeRun(T buffer, short value, int length)391     private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) {
392         try {
393             char valueChar = (char) value;
394             if (length < 4) {
395                 for (int j=0; j<length; ++j) {
396                     if (valueChar == ESCAPE) {
397                         buffer.append(ESCAPE);
398                     }
399                     buffer.append(valueChar);
400                 }
401             }
402             else {
403                 if (length == ESCAPE) {
404                     if (valueChar == ESCAPE) {
405                         buffer.append(ESCAPE);
406                     }
407                     buffer.append(valueChar);
408                     --length;
409                 }
410                 buffer.append(ESCAPE);
411                 buffer.append((char) length);
412                 buffer.append(valueChar); // Don't need to escape this value
413             }
414         } catch (IOException e) {
415             throw new IllegalIcuArgumentException(e);
416         }
417     }
418 
419     /**
420      * Encode a run, possibly a degenerate run (of < 4 values).
421      * @param length The length of the run; must be > 0 && <= 0xFF.
422      */
encodeRun(T buffer, byte value, int length, byte[] state)423     private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length,
424             byte[] state) {
425         if (length < 4) {
426             for (int j=0; j<length; ++j) {
427                 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
428                 appendEncodedByte(buffer, value, state);
429             }
430         }
431         else {
432             if ((byte)length == ESCAPE_BYTE) {
433                 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
434                 appendEncodedByte(buffer, value, state);
435                 --length;
436             }
437             appendEncodedByte(buffer, ESCAPE_BYTE, state);
438             appendEncodedByte(buffer, (byte)length, state);
439             appendEncodedByte(buffer, value, state); // Don't need to escape this value
440         }
441     }
442 
443     /**
444      * Append a byte to the given Appendable, packing two bytes into each
445      * character.  The state parameter maintains intermediary data between
446      * calls.
447      * @param state A two-element array, with state[0] == 0 if this is the
448      * first byte of a pair, or state[0] != 0 if this is the second byte
449      * of a pair, in which case state[1] is the first byte.
450      */
appendEncodedByte(T buffer, byte value, byte[] state)451     private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value,
452             byte[] state) {
453         try {
454             if (state[0] != 0) {
455                 char c = (char) ((state[1] << 8) | ((value) & 0xFF));
456                 buffer.append(c);
457                 state[0] = 0;
458             }
459             else {
460                 state[0] = 1;
461                 state[1] = value;
462             }
463         } catch (IOException e) {
464             throw new IllegalIcuArgumentException(e);
465         }
466     }
467 
468     /**
469      * Construct an array of ints from a run-length encoded string.
470      */
RLEStringToIntArray(String s)471     static public final int[] RLEStringToIntArray(String s) {
472         int length = getInt(s, 0);
473         int[] array = new int[length];
474         int ai = 0, i = 1;
475 
476         int maxI = s.length() / 2;
477         while (ai < length && i < maxI) {
478             int c = getInt(s, i++);
479 
480             if (c == ESCAPE) {
481                 c = getInt(s, i++);
482                 if (c == ESCAPE) {
483                     array[ai++] = c;
484                 } else {
485                     int runLength = c;
486                     int runValue = getInt(s, i++);
487                     for (int j=0; j<runLength; ++j) {
488                         array[ai++] = runValue;
489                     }
490                 }
491             }
492             else {
493                 array[ai++] = c;
494             }
495         }
496 
497         if (ai != length || i != maxI) {
498             throw new IllegalStateException("Bad run-length encoded int array");
499         }
500 
501         return array;
502     }
getInt(String s, int i)503     static final int getInt(String s, int i) {
504         return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1);
505     }
506 
507     /**
508      * Construct an array of shorts from a run-length encoded string.
509      */
RLEStringToShortArray(String s)510     static public final short[] RLEStringToShortArray(String s) {
511         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
512         short[] array = new short[length];
513         int ai = 0;
514         for (int i=2; i<s.length(); ++i) {
515             char c = s.charAt(i);
516             if (c == ESCAPE) {
517                 c = s.charAt(++i);
518                 if (c == ESCAPE) {
519                     array[ai++] = (short) c;
520                 } else {
521                     int runLength = c;
522                     short runValue = (short) s.charAt(++i);
523                     for (int j=0; j<runLength; ++j) array[ai++] = runValue;
524                 }
525             }
526             else {
527                 array[ai++] = (short) c;
528             }
529         }
530 
531         if (ai != length)
532             throw new IllegalStateException("Bad run-length encoded short array");
533 
534         return array;
535     }
536 
537     /**
538      * Construct an array of shorts from a run-length encoded string.
539      */
RLEStringToCharArray(String s)540     static public final char[] RLEStringToCharArray(String s) {
541         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
542         char[] array = new char[length];
543         int ai = 0;
544         for (int i=2; i<s.length(); ++i) {
545             char c = s.charAt(i);
546             if (c == ESCAPE) {
547                 c = s.charAt(++i);
548                 if (c == ESCAPE) {
549                     array[ai++] = c;
550                 } else {
551                     int runLength = c;
552                     char runValue = s.charAt(++i);
553                     for (int j=0; j<runLength; ++j) array[ai++] = runValue;
554                 }
555             }
556             else {
557                 array[ai++] = c;
558             }
559         }
560 
561         if (ai != length)
562             throw new IllegalStateException("Bad run-length encoded short array");
563 
564         return array;
565     }
566 
567     /**
568      * Construct an array of bytes from a run-length encoded string.
569      */
RLEStringToByteArray(String s)570     static public final byte[] RLEStringToByteArray(String s) {
571         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
572         byte[] array = new byte[length];
573         boolean nextChar = true;
574         char c = 0;
575         int node = 0;
576         int runLength = 0;
577         int i = 2;
578         for (int ai=0; ai<length; ) {
579             // This part of the loop places the next byte into the local
580             // variable 'b' each time through the loop.  It keeps the
581             // current character in 'c' and uses the boolean 'nextChar'
582             // to see if we've taken both bytes out of 'c' yet.
583             byte b;
584             if (nextChar) {
585                 c = s.charAt(i++);
586                 b = (byte) (c >> 8);
587                 nextChar = false;
588             }
589             else {
590                 b = (byte) (c & 0xFF);
591                 nextChar = true;
592             }
593 
594             // This part of the loop is a tiny state machine which handles
595             // the parsing of the run-length encoding.  This would be simpler
596             // if we could look ahead, but we can't, so we use 'node' to
597             // move between three nodes in the state machine.
598             switch (node) {
599             case 0:
600                 // Normal idle node
601                 if (b == ESCAPE_BYTE) {
602                     node = 1;
603                 }
604                 else {
605                     array[ai++] = b;
606                 }
607                 break;
608             case 1:
609                 // We have seen one ESCAPE_BYTE; we expect either a second
610                 // one, or a run length and value.
611                 if (b == ESCAPE_BYTE) {
612                     array[ai++] = ESCAPE_BYTE;
613                     node = 0;
614                 }
615                 else {
616                     runLength = b;
617                     // Interpret signed byte as unsigned
618                     if (runLength < 0) runLength += 0x100;
619                     node = 2;
620                 }
621                 break;
622             case 2:
623                 // We have seen an ESCAPE_BYTE and length byte.  We interpret
624                 // the next byte as the value to be repeated.
625                 for (int j=0; j<runLength; ++j) array[ai++] = b;
626                 node = 0;
627                 break;
628             }
629         }
630 
631         if (node != 0)
632             throw new IllegalStateException("Bad run-length encoded byte array");
633 
634         if (i != s.length())
635             throw new IllegalStateException("Excess data in RLE byte array string");
636 
637         return array;
638     }
639 
    // Value of the "line.separator" system property, read once when this class is initialized.
    // NOTE(review): the field is public and not final, so any code can reassign it --
    // confirm whether that is intentional before relying on it being constant.
    static public String LINE_SEPARATOR = System.getProperty("line.separator");
641 
642     /**
643      * Format a String for representation in a source file.  This includes
644      * breaking it into lines and escaping characters using octal notation
645      * when necessary (control characters and double quotes).
646      */
formatForSource(String s)647     static public final String formatForSource(String s) {
648         StringBuilder buffer = new StringBuilder();
649         for (int i=0; i<s.length();) {
650             if (i > 0) buffer.append('+').append(LINE_SEPARATOR);
651             buffer.append("        \"");
652             int count = 11;
653             while (i<s.length() && count<80) {
654                 char c = s.charAt(i++);
655                 if (c < '\u0020' || c == '"' || c == '\\') {
656                     if (c == '\n') {
657                         buffer.append("\\n");
658                         count += 2;
659                     } else if (c == '\t') {
660                         buffer.append("\\t");
661                         count += 2;
662                     } else if (c == '\r') {
663                         buffer.append("\\r");
664                         count += 2;
665                     } else {
666                         // Represent control characters, backslash and double quote
667                         // using octal notation; otherwise the string we form
668                         // won't compile, since Unicode escape sequences are
669                         // processed before tokenization.
670                         buffer.append('\\');
671                         buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
672                         buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
673                         buffer.append(HEX_DIGIT[(c & 0007)]);
674                         count += 4;
675                     }
676                 }
677                 else if (c <= '\u007E') {
678                     buffer.append(c);
679                     count += 1;
680                 }
681                 else {
682                     buffer.append("\\u");
683                     buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
684                     buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
685                     buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
686                     buffer.append(HEX_DIGIT[(c & 0x000F)]);
687                     count += 6;
688                 }
689             }
690             buffer.append('"');
691         }
692         return buffer.toString();
693     }
694 
695     static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7',
696         '8','9','A','B','C','D','E','F'};
697 
698     /**
699      * Format a String for representation in a source file.  Like
700      * formatForSource but does not do line breaking.
701      */
format1ForSource(String s)702     static public final String format1ForSource(String s) {
703         StringBuilder buffer = new StringBuilder();
704         buffer.append("\"");
705         for (int i=0; i<s.length();) {
706             char c = s.charAt(i++);
707             if (c < '\u0020' || c == '"' || c == '\\') {
708                 if (c == '\n') {
709                     buffer.append("\\n");
710                 } else if (c == '\t') {
711                     buffer.append("\\t");
712                 } else if (c == '\r') {
713                     buffer.append("\\r");
714                 } else {
715                     // Represent control characters, backslash and double quote
716                     // using octal notation; otherwise the string we form
717                     // won't compile, since Unicode escape sequences are
718                     // processed before tokenization.
719                     buffer.append('\\');
720                     buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
721                     buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
722                     buffer.append(HEX_DIGIT[(c & 0007)]);
723                 }
724             }
725             else if (c <= '\u007E') {
726                 buffer.append(c);
727             }
728             else {
729                 buffer.append("\\u");
730                 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
731                 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
732                 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
733                 buffer.append(HEX_DIGIT[(c & 0x000F)]);
734             }
735         }
736         buffer.append('"');
737         return buffer.toString();
738     }
739 
740     /**
741      * Convert characters outside the range U+0020 to U+007F to
742      * Unicode escapes, and convert backslash to a double backslash.
743      */
escape(String s)744     public static final String escape(String s) {
745         StringBuilder buf = new StringBuilder();
746         for (int i=0; i<s.length(); ) {
747             int c = Character.codePointAt(s, i);
748             i += UTF16.getCharCount(c);
749             if (c >= ' ' && c <= 0x007F) {
750                 if (c == '\\') {
751                     buf.append("\\\\"); // That is, "\\"
752                 } else {
753                     buf.append((char)c);
754                 }
755             } else {
756                 boolean four = c <= 0xFFFF;
757                 buf.append(four ? "\\u" : "\\U");
758                 buf.append(hex(c, four ? 4 : 8));
759             }
760         }
761         return buf.toString();
762     }
763 
764     /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
765     static private final char[] UNESCAPE_MAP = {
766         /*"   0x22, 0x22 */
767         /*'   0x27, 0x27 */
768         /*?   0x3F, 0x3F */
769         /*\   0x5C, 0x5C */
770         /*a*/ 0x61, 0x07,
771         /*b*/ 0x62, 0x08,
772         /*e*/ 0x65, 0x1b,
773         /*f*/ 0x66, 0x0c,
774         /*n*/ 0x6E, 0x0a,
775         /*r*/ 0x72, 0x0d,
776         /*t*/ 0x74, 0x09,
777         /*v*/ 0x76, 0x0b
778     };
779 
780     /**
781      * Convert an escape to a 32-bit code point value.  We attempt
782      * to parallel the icu4c unescapeAt() function.
783      * @param offset16 an array containing offset to the character
784      * <em>after</em> the backslash.  Upon return offset16[0] will
785      * be updated to point after the escape sequence.
786      * @return character value from 0 to 10FFFF, or -1 on error.
787      */
unescapeAt(String s, int[] offset16)788     public static int unescapeAt(String s, int[] offset16) {
789         int c;
790         int result = 0;
791         int n = 0;
792         int minDig = 0;
793         int maxDig = 0;
794         int bitsPerDigit = 4;
795         int dig;
796         int i;
797         boolean braces = false;
798 
799         /* Check that offset is in range */
800         int offset = offset16[0];
801         int length = s.length();
802         if (offset < 0 || offset >= length) {
803             return -1;
804         }
805 
806         /* Fetch first UChar after '\\' */
807         c = Character.codePointAt(s, offset);
808         offset += UTF16.getCharCount(c);
809 
810         /* Convert hexadecimal and octal escapes */
811         switch (c) {
812         case 'u':
813             minDig = maxDig = 4;
814             break;
815         case 'U':
816             minDig = maxDig = 8;
817             break;
818         case 'x':
819             minDig = 1;
820             if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
821                 ++offset;
822                 braces = true;
823                 maxDig = 8;
824             } else {
825                 maxDig = 2;
826             }
827             break;
828         default:
829             dig = UCharacter.digit(c, 8);
830             if (dig >= 0) {
831                 minDig = 1;
832                 maxDig = 3;
833                 n = 1; /* Already have first octal digit */
834                 bitsPerDigit = 3;
835                 result = dig;
836             }
837             break;
838         }
839         if (minDig != 0) {
840             while (offset < length && n < maxDig) {
841                 c = UTF16.charAt(s, offset);
842                 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
843                 if (dig < 0) {
844                     break;
845                 }
846                 result = (result << bitsPerDigit) | dig;
847                 offset += UTF16.getCharCount(c);
848                 ++n;
849             }
850             if (n < minDig) {
851                 return -1;
852             }
853             if (braces) {
854                 if (c != 0x7D /*}*/) {
855                     return -1;
856                 }
857                 ++offset;
858             }
859             if (result < 0 || result >= 0x110000) {
860                 return -1;
861             }
862             // If an escape sequence specifies a lead surrogate, see
863             // if there is a trail surrogate after it, either as an
864             // escape or as a literal.  If so, join them up into a
865             // supplementary.
866             if (offset < length &&
867                     UTF16.isLeadSurrogate((char) result)) {
868                 int ahead = offset+1;
869                 c = s.charAt(offset); // [sic] get 16-bit code unit
870                 if (c == '\\' && ahead < length) {
871                     int o[] = new int[] { ahead };
872                     c = unescapeAt(s, o);
873                     ahead = o[0];
874                 }
875                 if (UTF16.isTrailSurrogate((char) c)) {
876                     offset = ahead;
877                     result = Character.toCodePoint((char) result, (char) c);
878                 }
879             }
880             offset16[0] = offset;
881             return result;
882         }
883 
884         /* Convert C-style escapes in table */
885         for (i=0; i<UNESCAPE_MAP.length; i+=2) {
886             if (c == UNESCAPE_MAP[i]) {
887                 offset16[0] = offset;
888                 return UNESCAPE_MAP[i+1];
889             } else if (c < UNESCAPE_MAP[i]) {
890                 break;
891             }
892         }
893 
894         /* Map \cX to control-X: X & 0x1F */
895         if (c == 'c' && offset < length) {
896             c = UTF16.charAt(s, offset);
897             offset16[0] = offset + UTF16.getCharCount(c);
898             return 0x1F & c;
899         }
900 
901         /* If no special forms are recognized, then consider
902          * the backslash to generically escape the next character. */
903         offset16[0] = offset;
904         return c;
905     }
906 
907     /**
908      * Convert all escapes in a given string using unescapeAt().
909      * @exception IllegalArgumentException if an invalid escape is
910      * seen.
911      */
unescape(String s)912     public static String unescape(String s) {
913         StringBuilder buf = new StringBuilder();
914         int[] pos = new int[1];
915         for (int i=0; i<s.length(); ) {
916             char c = s.charAt(i++);
917             if (c == '\\') {
918                 pos[0] = i;
919                 int e = unescapeAt(s, pos);
920                 if (e < 0) {
921                     throw new IllegalArgumentException("Invalid escape sequence " +
922                             s.substring(i-1, Math.min(i+8, s.length())));
923                 }
924                 buf.appendCodePoint(e);
925                 i = pos[0];
926             } else {
927                 buf.append(c);
928             }
929         }
930         return buf.toString();
931     }
932 
933     /**
934      * Convert all escapes in a given string using unescapeAt().
935      * Leave invalid escape sequences unchanged.
936      */
unescapeLeniently(String s)937     public static String unescapeLeniently(String s) {
938         StringBuilder buf = new StringBuilder();
939         int[] pos = new int[1];
940         for (int i=0; i<s.length(); ) {
941             char c = s.charAt(i++);
942             if (c == '\\') {
943                 pos[0] = i;
944                 int e = unescapeAt(s, pos);
945                 if (e < 0) {
946                     buf.append(c);
947                 } else {
948                     buf.appendCodePoint(e);
949                     i = pos[0];
950                 }
951             } else {
952                 buf.append(c);
953             }
954         }
955         return buf.toString();
956     }
957 
958     /**
959      * Convert a char to 4 hex uppercase digits.  E.g., hex('a') =>
960      * "0041".
961      */
hex(long ch)962     public static String hex(long ch) {
963         return hex(ch, 4);
964     }
965 
966     /**
967      * Supplies a zero-padded hex representation of an integer (without 0x)
968      */
hex(long i, int places)969     static public String hex(long i, int places) {
970         if (i == Long.MIN_VALUE) return "-8000000000000000";
971         boolean negative = i < 0;
972         if (negative) {
973             i = -i;
974         }
975         String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
976         if (result.length() < places) {
977             result = "0000000000000000".substring(result.length(),places) + result;
978         }
979         if (negative) {
980             return '-' + result;
981         }
982         return result;
983     }
984 
985     /**
986      * Convert a string to comma-separated groups of 4 hex uppercase
987      * digits.  E.g., hex('ab') => "0041,0042".
988      */
989     public static String hex(CharSequence s) {
990         return hex(s, 4, ",", true, new StringBuilder()).toString();
991     }
992 
993     /**
994      * Convert a string to separated groups of hex uppercase
995      * digits.  E.g., hex('ab'...) => "0041,0042".  Append the output
996      * to the given Appendable.
997      */
998     public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
999         try {
1000             if (useCodePoints) {
1001                 int cp;
1002                 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1003                     cp = Character.codePointAt(s, i);
1004                     if (i != 0) {
1005                         result.append(separator);
1006                     }
1007                     result.append(hex(cp,width));
1008                 }
1009             } else {
1010                 for (int i = 0; i < s.length(); ++i) {
1011                     if (i != 0) {
1012                         result.append(separator);
1013                     }
1014                     result.append(hex(s.charAt(i),width));
1015                 }
1016             }
1017             return result;
1018         } catch (IOException e) {
1019             throw new IllegalIcuArgumentException(e);
1020         }
1021     }
1022 
1023     public static String hex(byte[] o, int start, int end, String separator) {
1024         StringBuilder result = new StringBuilder();
1025         //int ch;
1026         for (int i = start; i < end; ++i) {
1027           if (i != 0) result.append(separator);
1028           result.append(hex(o[i]));
1029         }
1030         return result.toString();
1031       }
1032 
1033     /**
1034      * Convert a string to comma-separated groups of 4 hex uppercase
1035      * digits.  E.g., hex('ab') => "0041,0042".
1036      */
1037     public static <S extends CharSequence> String hex(S s, int width, S separator) {
1038         return hex(s, width, separator, true, new StringBuilder()).toString();
1039     }
1040 
1041     /**
1042      * Split a string into pieces based on the given divider character
1043      * @param s the string to split
1044      * @param divider the character on which to split.  Occurrences of
1045      * this character are not included in the output
1046      * @param output an array to receive the substrings between
1047      * instances of divider.  It must be large enough on entry to
1048      * accommodate all output.  Adjacent instances of the divider
1049      * character will place empty strings into output.  Before
1050      * returning, output is padded out with empty strings.
1051      */
1052     public static void split(String s, char divider, String[] output) {
1053         int last = 0;
1054         int current = 0;
1055         int i;
1056         for (i = 0; i < s.length(); ++i) {
1057             if (s.charAt(i) == divider) {
1058                 output[current++] = s.substring(last,i);
1059                 last = i+1;
1060             }
1061         }
1062         output[current++] = s.substring(last,i);
1063         while (current < output.length) {
1064             output[current++] = "";
1065         }
1066     }
1067 
1068     /**
1069      * Split a string into pieces based on the given divider character
1070      * @param s the string to split
1071      * @param divider the character on which to split.  Occurrences of
1072      * this character are not included in the output
1073      * @return output an array to receive the substrings between
1074      * instances of divider. Adjacent instances of the divider
1075      * character will place empty strings into output.
1076      */
1077     public static String[] split(String s, char divider) {
1078         int last = 0;
1079         int i;
1080         ArrayList<String> output = new ArrayList<>();
1081         for (i = 0; i < s.length(); ++i) {
1082             if (s.charAt(i) == divider) {
1083                 output.add(s.substring(last,i));
1084                 last = i+1;
1085             }
1086         }
1087         output.add( s.substring(last,i));
1088         return output.toArray(new String[output.size()]);
1089     }
1090 
1091     /**
1092      * Look up a given string in a string array.  Returns the index at
1093      * which the first occurrence of the string was found in the
1094      * array, or -1 if it was not found.
1095      * @param source the string to search for
1096      * @param target the array of zero or more strings in which to
1097      * look for source
1098      * @return the index of target at which source first occurs, or -1
1099      * if not found
1100      */
1101     public static int lookup(String source, String[] target) {
1102         for (int i = 0; i < target.length; ++i) {
1103             if (source.equals(target[i])) return i;
1104         }
1105         return -1;
1106     }
1107 
1108     /**
1109      * Parse a single non-whitespace character 'ch', optionally
1110      * preceded by whitespace.
1111      * @param id the string to be parsed
1112      * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
1113      * offset of the first character to be parsed.  On output, pos[0]
1114      * is the index after the last parsed character.  If the parse
1115      * fails, pos[0] will be unchanged.
1116      * @param ch the non-whitespace character to be parsed.
1117      * @return true if 'ch' is seen preceded by zero or more
1118      * whitespace characters.
1119      */
1120     public static boolean parseChar(String id, int[] pos, char ch) {
1121         int start = pos[0];
1122         pos[0] = PatternProps.skipWhiteSpace(id, pos[0]);
1123         if (pos[0] == id.length() ||
1124                 id.charAt(pos[0]) != ch) {
1125             pos[0] = start;
1126             return false;
1127         }
1128         ++pos[0];
1129         return true;
1130     }
1131 
1132     /**
1133      * Parse a pattern string starting at offset pos.  Keywords are
1134      * matched case-insensitively.  Spaces may be skipped and may be
1135      * optional or required.  Integer values may be parsed, and if
1136      * they are, they will be returned in the given array.  If
1137      * successful, the offset of the next non-space character is
1138      * returned.  On failure, -1 is returned.
1139      * @param pattern must only contain lowercase characters, which
1140      * will match their uppercase equivalents as well.  A space
1141      * character matches one or more required spaces.  A '~' character
1142      * matches zero or more optional spaces.  A '#' character matches
1143      * an integer and stores it in parsedInts, which the caller must
1144      * ensure has enough capacity.
1145      * @param parsedInts array to receive parsed integers.  Caller
1146      * must ensure that parsedInts.length is >= the number of '#'
1147      * signs in 'pattern'.
1148      * @return the position after the last character parsed, or -1 if
1149      * the parse failed
1150      */
1151     @SuppressWarnings("fallthrough")
1152     public static int parsePattern(String rule, int pos, int limit,
1153             String pattern, int[] parsedInts) {
1154         // TODO Update this to handle surrogates
1155         int[] p = new int[1];
1156         int intCount = 0; // number of integers parsed
1157         for (int i=0; i<pattern.length(); ++i) {
1158             char cpat = pattern.charAt(i);
1159             char c;
1160             switch (cpat) {
1161             case ' ':
1162                 if (pos >= limit) {
1163                     return -1;
1164                 }
1165                 c = rule.charAt(pos++);
1166                 if (!PatternProps.isWhiteSpace(c)) {
1167                     return -1;
1168                 }
1169                 // FALL THROUGH to skipWhitespace
1170             case '~':
1171                 pos = PatternProps.skipWhiteSpace(rule, pos);
1172                 break;
1173             case '#':
1174                 p[0] = pos;
1175                 parsedInts[intCount++] = parseInteger(rule, p, limit);
1176                 if (p[0] == pos) {
1177                     // Syntax error; failed to parse integer
1178                     return -1;
1179                 }
1180                 pos = p[0];
1181                 break;
1182             default:
1183                 if (pos >= limit) {
1184                     return -1;
1185                 }
1186                 c = (char) UCharacter.toLowerCase(rule.charAt(pos++));
1187                 if (c != cpat) {
1188                     return -1;
1189                 }
1190                 break;
1191             }
1192         }
1193         return pos;
1194     }
1195 
1196     /**
1197      * Parse a pattern string within the given Replaceable and a parsing
1198      * pattern.  Characters are matched literally and case-sensitively
1199      * except for the following special characters:
1200      *
1201      * ~  zero or more Pattern_White_Space chars
1202      *
1203      * If end of pattern is reached with all matches along the way,
1204      * pos is advanced to the first unparsed index and returned.
1205      * Otherwise -1 is returned.
1206      * @param pat pattern that controls parsing
1207      * @param text text to be parsed, starting at index
1208      * @param index offset to first character to parse
1209      * @param limit offset after last character to parse
1210      * @return index after last parsed character, or -1 on parse failure.
1211      */
1212     public static int parsePattern(String pat,
1213             Replaceable text,
1214             int index,
1215             int limit) {
1216         int ipat = 0;
1217 
1218         // empty pattern matches immediately
1219         if (ipat == pat.length()) {
1220             return index;
1221         }
1222 
1223         int cpat = Character.codePointAt(pat, ipat);
1224 
1225         while (index < limit) {
1226             int c = text.char32At(index);
1227 
1228             // parse \s*
1229             if (cpat == '~') {
1230                 if (PatternProps.isWhiteSpace(c)) {
1231                     index += UTF16.getCharCount(c);
1232                     continue;
1233                 } else {
1234                     if (++ipat == pat.length()) {
1235                         return index; // success; c unparsed
1236                     }
1237                     // fall thru; process c again with next cpat
1238                 }
1239             }
1240 
1241             // parse literal
1242             else if (c == cpat) {
1243                 int n = UTF16.getCharCount(c);
1244                 index += n;
1245                 ipat += n;
1246                 if (ipat == pat.length()) {
1247                     return index; // success; c parsed
1248                 }
1249                 // fall thru; get next cpat
1250             }
1251 
1252             // match failure of literal
1253             else {
1254                 return -1;
1255             }
1256 
1257             cpat = UTF16.charAt(pat, ipat);
1258         }
1259 
1260         return -1; // text ended before end of pat
1261     }
1262 
1263     /**
1264      * Parse an integer at pos, either of the form \d+ or of the form
1265      * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
1266      * or octal format.
1267      * @param pos INPUT-OUTPUT parameter.  On input, the first
1268      * character to parse.  On output, the character after the last
1269      * parsed character.
1270      */
1271     public static int parseInteger(String rule, int[] pos, int limit) {
1272         int count = 0;
1273         int value = 0;
1274         int p = pos[0];
1275         int radix = 10;
1276 
1277         if (rule.regionMatches(true, p, "0x", 0, 2)) {
1278             p += 2;
1279             radix = 16;
1280         } else if (p < limit && rule.charAt(p) == '0') {
1281             p++;
1282             count = 1;
1283             radix = 8;
1284         }
1285 
1286         while (p < limit) {
1287             int d = UCharacter.digit(rule.charAt(p++), radix);
1288             if (d < 0) {
1289                 --p;
1290                 break;
1291             }
1292             ++count;
1293             int v = (value * radix) + d;
1294             if (v <= value) {
1295                 // If there are too many input digits, at some point
1296                 // the value will go negative, e.g., if we have seen
1297                 // "0x8000000" already and there is another '0', when
1298                 // we parse the next 0 the value will go negative.
1299                 return 0;
1300             }
1301             value = v;
1302         }
1303         if (count > 0) {
1304             pos[0] = p;
1305         }
1306         return value;
1307     }
1308 
1309     /**
1310      * Parse a Unicode identifier from the given string at the given
1311      * position.  Return the identifier, or null if there is no
1312      * identifier.
1313      * @param str the string to parse
1314      * @param pos INPUT-OUPUT parameter.  On INPUT, pos[0] is the
1315      * first character to examine.  It must be less than str.length(),
1316      * and it must not point to a whitespace character.  That is, must
1317      * have pos[0] < str.length().  On
1318      * OUTPUT, the position after the last parsed character.
1319      * @return the Unicode identifier, or null if there is no valid
1320      * identifier at pos[0].
1321      */
1322     public static String parseUnicodeIdentifier(String str, int[] pos) {
1323         // assert(pos[0] < str.length());
1324         StringBuilder buf = new StringBuilder();
1325         int p = pos[0];
1326         while (p < str.length()) {
1327             int ch = Character.codePointAt(str, p);
1328             if (buf.length() == 0) {
1329                 if (UCharacter.isUnicodeIdentifierStart(ch)) {
1330                     buf.appendCodePoint(ch);
1331                 } else {
1332                     return null;
1333                 }
1334             } else {
1335                 if (UCharacter.isUnicodeIdentifierPart(ch)) {
1336                     buf.appendCodePoint(ch);
1337                 } else {
1338                     break;
1339                 }
1340             }
1341             p += UTF16.getCharCount(ch);
1342         }
1343         pos[0] = p;
1344         return buf.toString();
1345     }
1346 
1347     static final char DIGITS[] = {
1348         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
1349         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
1350         'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
1351         'U', 'V', 'W', 'X', 'Y', 'Z'
1352     };
1353 
1354     /**
1355      * Append the digits of a positive integer to the given
1356      * <code>Appendable</code> in the given radix. This is
1357      * done recursively since it is easiest to generate the low-
1358      * order digit first, but it must be appended last.
1359      *
1360      * @param result is the <code>Appendable</code> to append to
1361      * @param n is the positive integer
1362      * @param radix is the radix, from 2 to 36 inclusive
1363      * @param minDigits is the minimum number of digits to append.
1364      */
1365     private static <T extends Appendable> void recursiveAppendNumber(T result, int n,
1366             int radix, int minDigits)
1367     {
1368         try {
1369             int digit = n % radix;
1370 
1371             if (n >= radix || minDigits > 1) {
1372                 recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
1373             }
1374             result.append(DIGITS[digit]);
1375         } catch (IOException e) {
1376             throw new IllegalIcuArgumentException(e);
1377         }
1378     }
1379 
1380     /**
1381      * Append a number to the given Appendable in the given radix.
1382      * Standard digits '0'-'9' are used and letters 'A'-'Z' for
1383      * radices 11 through 36.
1384      * @param result the digits of the number are appended here
1385      * @param n the number to be converted to digits; may be negative.
1386      * If negative, a '-' is prepended to the digits.
1387      * @param radix a radix from 2 to 36 inclusive.
1388      * @param minDigits the minimum number of digits, not including
1389      * any '-', to produce.  Values less than 2 have no effect.  One
1390      * digit is always emitted regardless of this parameter.
1391      * @return a reference to result
1392      */
1393     public static <T extends Appendable> T appendNumber(T result, int n,
1394             int radix, int minDigits)
1395     {
1396         try {
1397             if (radix < 2 || radix > 36) {
1398                 throw new IllegalArgumentException("Illegal radix " + radix);
1399             }
1400 
1401 
1402             int abs = n;
1403 
1404             if (n < 0) {
1405                 abs = -n;
1406                 result.append("-");
1407             }
1408 
1409             recursiveAppendNumber(result, abs, radix, minDigits);
1410 
1411             return result;
1412         } catch (IOException e) {
1413             throw new IllegalIcuArgumentException(e);
1414         }
1415 
1416     }
1417 
1418     /**
1419      * Parse an unsigned 31-bit integer at the given offset.  Use
1420      * UCharacter.digit() to parse individual characters into digits.
1421      * @param text the text to be parsed
1422      * @param pos INPUT-OUTPUT parameter.  On entry, pos[0] is the
1423      * offset within text at which to start parsing; it should point
1424      * to a valid digit.  On exit, pos[0] is the offset after the last
1425      * parsed character.  If the parse failed, it will be unchanged on
1426      * exit.  Must be >= 0 on entry.
1427      * @param radix the radix in which to parse; must be >= 2 and <=
1428      * 36.
1429      * @return a non-negative parsed number, or -1 upon parse failure.
1430      * Parse fails if there are no digits, that is, if pos[0] does not
1431      * point to a valid digit on entry, or if the number to be parsed
1432      * does not fit into a 31-bit unsigned integer.
1433      */
1434     public static int parseNumber(String text, int[] pos, int radix) {
1435         // assert(pos[0] >= 0);
1436         // assert(radix >= 2);
1437         // assert(radix <= 36);
1438         int n = 0;
1439         int p = pos[0];
1440         while (p < text.length()) {
1441             int ch = Character.codePointAt(text, p);
1442             int d = UCharacter.digit(ch, radix);
1443             if (d < 0) {
1444                 break;
1445             }
1446             n = radix*n + d;
1447             // ASSUME that when a 32-bit integer overflows it becomes
1448             // negative.  E.g., 214748364 * 10 + 8 => negative value.
1449             if (n < 0) {
1450                 return -1;
1451             }
1452             ++p;
1453         }
1454         if (p == pos[0]) {
1455             return -1;
1456         }
1457         pos[0] = p;
1458         return n;
1459     }
1460 
1461     /**
1462      * Return true if the character is NOT printable ASCII.  The tab,
1463      * newline and linefeed characters are considered unprintable.
1464      */
1465     public static boolean isUnprintable(int c) {
1466         //0x20 = 32 and 0x7E = 126
1467         return !(c >= 0x20 && c <= 0x7E);
1468     }
1469 
1470     /**
1471      * Escape unprintable characters using <backslash>uxxxx notation
1472      * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
1473      * above.  If the character is printable ASCII, then do nothing
1474      * and return FALSE.  Otherwise, append the escaped notation and
1475      * return TRUE.
1476      */
1477     public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
1478         try {
1479             if (isUnprintable(c)) {
1480                 result.append('\\');
1481                 if ((c & ~0xFFFF) != 0) {
1482                     result.append('U');
1483                     result.append(DIGITS[0xF&(c>>28)]);
1484                     result.append(DIGITS[0xF&(c>>24)]);
1485                     result.append(DIGITS[0xF&(c>>20)]);
1486                     result.append(DIGITS[0xF&(c>>16)]);
1487                 } else {
1488                     result.append('u');
1489                 }
1490                 result.append(DIGITS[0xF&(c>>12)]);
1491                 result.append(DIGITS[0xF&(c>>8)]);
1492                 result.append(DIGITS[0xF&(c>>4)]);
1493                 result.append(DIGITS[0xF&c]);
1494                 return true;
1495             }
1496             return false;
1497         } catch (IOException e) {
1498             throw new IllegalIcuArgumentException(e);
1499         }
1500     }
1501 
1502     /**
1503      * Returns the index of the first character in a set, ignoring quoted text.
1504      * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
1505      * found by a search for "h".  Unlike String.indexOf(), this method searches
1506      * not for a single character, but for any character of the string
1507      * <code>setOfChars</code>.
1508      * @param text text to be searched
1509      * @param start the beginning index, inclusive; <code>0 <= start
1510      * <= limit</code>.
1511      * @param limit the ending index, exclusive; <code>start <= limit
1512      * <= text.length()</code>.
1513      * @param setOfChars string with one or more distinct characters
1514      * @return Offset of the first character in <code>setOfChars</code>
1515      * found, or -1 if not found.
1516      * @see String#indexOf
1517      */
1518     public static int quotedIndexOf(String text, int start, int limit,
1519             String setOfChars) {
1520         for (int i=start; i<limit; ++i) {
1521             char c = text.charAt(i);
1522             if (c == BACKSLASH) {
1523                 ++i;
1524             } else if (c == APOSTROPHE) {
1525                 while (++i < limit
1526                         && text.charAt(i) != APOSTROPHE) {}
1527             } else if (setOfChars.indexOf(c) >= 0) {
1528                 return i;
1529             }
1530         }
1531         return -1;
1532     }
1533 
1534     /**
1535      * Append a character to a rule that is being built up.  To flush
1536      * the quoteBuf to rule, make one final call with isLiteral == true.
1537      * If there is no final character, pass in (int)-1 as c.
1538      * @param rule the string to append the character to
1539      * @param c the character to append, or (int)-1 if none.
1540      * @param isLiteral if true, then the given character should not be
1541      * quoted or escaped.  Usually this means it is a syntactic element
1542      * such as > or $
1543      * @param escapeUnprintable if true, then unprintable characters
1544      * should be escaped using escapeUnprintable().  These escapes will
1545      * appear outside of quotes.
1546      * @param quoteBuf a buffer which is used to build up quoted
1547      * substrings.  The caller should initially supply an empty buffer,
1548      * and thereafter should not modify the buffer.  The buffer should be
1549      * cleared out by, at the end, calling this method with a literal
1550      * character (which may be -1).
1551      */
1552     public static void appendToRule(StringBuffer rule,
1553             int c,
1554             boolean isLiteral,
1555             boolean escapeUnprintable,
1556             StringBuffer quoteBuf) {
1557         // If we are escaping unprintables, then escape them outside
1558         // quotes.  \\u and \\U are not recognized within quotes.  The same
1559         // logic applies to literals, but literals are never escaped.
1560         if (isLiteral ||
1561                 (escapeUnprintable && Utility.isUnprintable(c))) {
1562             if (quoteBuf.length() > 0) {
1563                 // We prefer backslash APOSTROPHE to double APOSTROPHE
1564                 // (more readable, less similar to ") so if there are
1565                 // double APOSTROPHEs at the ends, we pull them outside
1566                 // of the quote.
1567 
1568                 // If the first thing in the quoteBuf is APOSTROPHE
1569                 // (doubled) then pull it out.
1570                 while (quoteBuf.length() >= 2 &&
1571                         quoteBuf.charAt(0) == APOSTROPHE &&
1572                         quoteBuf.charAt(1) == APOSTROPHE) {
1573                     rule.append(BACKSLASH).append(APOSTROPHE);
1574                     quoteBuf.delete(0, 2);
1575                 }
1576                 // If the last thing in the quoteBuf is APOSTROPHE
1577                 // (doubled) then remove and count it and add it after.
1578                 int trailingCount = 0;
1579                 while (quoteBuf.length() >= 2 &&
1580                         quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
1581                         quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
1582                     quoteBuf.setLength(quoteBuf.length()-2);
1583                     ++trailingCount;
1584                 }
1585                 if (quoteBuf.length() > 0) {
1586                     rule.append(APOSTROPHE);
1587                     rule.append(quoteBuf);
1588                     rule.append(APOSTROPHE);
1589                     quoteBuf.setLength(0);
1590                 }
1591                 while (trailingCount-- > 0) {
1592                     rule.append(BACKSLASH).append(APOSTROPHE);
1593                 }
1594             }
1595             if (c != -1) {
1596                 /* Since spaces are ignored during parsing, they are
1597                  * emitted only for readability.  We emit one here
1598                  * only if there isn't already one at the end of the
1599                  * rule.
1600                  */
1601                 if (c == ' ') {
1602                     int len = rule.length();
1603                     if (len > 0 && rule.charAt(len-1) != ' ') {
1604                         rule.append(' ');
1605                     }
1606                 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
1607                     rule.appendCodePoint(c);
1608                 }
1609             }
1610         }
1611 
1612         // Escape ' and '\' and don't begin a quote just for them
1613         else if (quoteBuf.length() == 0 &&
1614                 (c == APOSTROPHE || c == BACKSLASH)) {
1615             rule.append(BACKSLASH).append((char)c);
1616         }
1617 
1618         // Specials (printable ascii that isn't [0-9a-zA-Z]) and
1619         // whitespace need quoting.  Also append stuff to quotes if we are
1620         // building up a quoted substring already.
1621         else if (quoteBuf.length() > 0 ||
1622                 (c >= 0x0021 && c <= 0x007E &&
1623                         !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
1624                                 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
1625                                 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
1626                                 PatternProps.isWhiteSpace(c)) {
1627             quoteBuf.appendCodePoint(c);
1628             // Double ' within a quote
1629             if (c == APOSTROPHE) {
1630                 quoteBuf.append((char)c);
1631             }
1632         }
1633 
1634         // Otherwise just append
1635         else {
1636             rule.appendCodePoint(c);
1637         }
1638     }
1639 
1640     /**
1641      * Append the given string to the rule.  Calls the single-character
1642      * version of appendToRule for each character.
1643      */
1644     public static void appendToRule(StringBuffer rule,
1645             String text,
1646             boolean isLiteral,
1647             boolean escapeUnprintable,
1648             StringBuffer quoteBuf) {
1649         for (int i=0; i<text.length(); ++i) {
1650             // Okay to process in 16-bit code units here
1651             appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
1652         }
1653     }
1654 
1655     /**
1656      * Given a matcher reference, which may be null, append its
1657      * pattern as a literal to the given rule.
1658      */
1659     public static void appendToRule(StringBuffer rule,
1660             UnicodeMatcher matcher,
1661             boolean escapeUnprintable,
1662             StringBuffer quoteBuf) {
1663         if (matcher != null) {
1664             appendToRule(rule, matcher.toPattern(escapeUnprintable),
1665                     true, escapeUnprintable, quoteBuf);
1666         }
1667     }
1668 
1669     /**
1670      * Compares 2 unsigned integers
1671      * @param source 32 bit unsigned integer
1672      * @param target 32 bit unsigned integer
1673      * @return 0 if equals, 1 if source is greater than target and -1
1674      *         otherwise
1675      */
1676     public static final int compareUnsigned(int source, int target)
1677     {
1678         source += MAGIC_UNSIGNED;
1679         target += MAGIC_UNSIGNED;
1680         if (source < target) {
1681             return -1;
1682         }
1683         else if (source > target) {
1684             return 1;
1685         }
1686         return 0;
1687     }
1688 
1689     /**
1690      * Find the highest bit in a positive integer. This is done
1691      * by doing a binary search through the bits.
1692      *
1693      * @param n is the integer
1694      *
1695      * @return the bit number of the highest bit, with 0 being
1696      * the low order bit, or -1 if <code>n</code> is not positive
1697      */
1698     public static final byte highBit(int n)
1699     {
1700         if (n <= 0) {
1701             return -1;
1702         }
1703 
1704         byte bit = 0;
1705 
1706         if (n >= 1 << 16) {
1707             n >>= 16;
1708         bit += 16;
1709         }
1710 
1711         if (n >= 1 << 8) {
1712             n >>= 8;
1713         bit += 8;
1714         }
1715 
1716         if (n >= 1 << 4) {
1717             n >>= 4;
1718         bit += 4;
1719         }
1720 
1721         if (n >= 1 << 2) {
1722             n >>= 2;
1723         bit += 2;
1724         }
1725 
1726         if (n >= 1 << 1) {
1727             n >>= 1;
1728         bit += 1;
1729         }
1730 
1731         return bit;
1732     }
1733     /**
1734      * Utility method to take a int[] containing codepoints and return
1735      * a string representation with code units.
1736      */
1737     public static String valueOf(int[]source){
1738         // TODO: Investigate why this method is not on UTF16 class
1739         StringBuilder result = new StringBuilder(source.length);
1740         for(int i=0; i<source.length; i++){
1741             result.appendCodePoint(source[i]);
1742         }
1743         return result.toString();
1744     }
1745 
1746 
1747     /**
1748      * Utility to duplicate a string count times
1749      * @param s String to be duplicated.
1750      * @param count Number of times to duplicate a string.
1751      */
1752     public static String repeat(String s, int count) {
1753         if (count <= 0) return "";
1754         if (count == 1) return s;
1755         StringBuilder result = new StringBuilder();
1756         for (int i = 0; i < count; ++i) {
1757             result.append(s);
1758         }
1759         return result.toString();
1760     }
1761 
1762     public static String[] splitString(String src, String target) {
1763         return src.split("\\Q" + target + "\\E");
1764     }
1765 
1766     /**
1767      * Split the string at runs of ascii whitespace characters.
1768      */
1769     public static String[] splitWhitespace(String src) {
1770         return src.split("\\s+");
1771     }
1772 
1773     /**
1774      * Parse a list of hex numbers and return a string
1775      * @param string String of hex numbers.
1776      * @param minLength Minimal length.
1777      * @param separator Separator.
1778      * @return A string from hex numbers.
1779      */
1780     public static String fromHex(String string, int minLength, String separator) {
1781         return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+"));
1782     }
1783 
1784     /**
1785      * Parse a list of hex numbers and return a string
1786      * @param string String of hex numbers.
1787      * @param minLength Minimal length.
1788      * @param separator Separator.
1789      * @return A string from hex numbers.
1790      */
1791     public static String fromHex(String string, int minLength, Pattern separator) {
1792         StringBuilder buffer = new StringBuilder();
1793         String[] parts = separator.split(string);
1794         for (String part : parts) {
1795             if (part.length() < minLength) {
1796                 throw new IllegalArgumentException("code point too short: " + part);
1797             }
1798             int cp = Integer.parseInt(part, 16);
1799             buffer.appendCodePoint(cp);
1800         }
1801         return buffer.toString();
1802     }
1803 
1804     /**
1805      * This implementation is equivalent to Java 8+ Math#addExact(int, int)
1806      * @param x the first value
1807      * @param y the second value
1808      * @return the result
1809      */
1810     public static int addExact(int x, int y) {
1811         int r = x + y;
1812         // HD 2-12 Overflow iff both arguments have the opposite sign of the result
1813         if (((x ^ r) & (y ^ r)) < 0) {
1814             throw new ArithmeticException("integer overflow");
1815         }
1816         return r;
1817     }
1818 
1819     /**
1820      * Returns whether the chars in the two CharSequences are equal.
1821      */
1822     public static boolean charSequenceEquals(CharSequence a, CharSequence b) {
1823         if (a == b) {
1824             return true;
1825         }
1826         if (a == null || b == null) {
1827             return false;
1828         }
1829         if (a.length() != b.length()) {
1830             return false;
1831         }
1832         for (int i = 0; i < a.length(); i++) {
1833             if (a.charAt(i) != b.charAt(i))
1834                 return false;
1835         }
1836         return true;
1837     }
1838 
1839     /**
1840      * Returns a hash code for a CharSequence that is equivalent to calling
1841      * charSequence.toString().hashCode()
1842      */
1843     public static int charSequenceHashCode(CharSequence value) {
1844         int hash = 0;
1845         for (int i = 0; i < value.length(); i++) {
1846             hash = hash * 31 + value.charAt(i);
1847         }
1848         return hash;
1849     }
1850 
1851     /**
1852      * Appends a CharSequence to an Appendable, converting IOException to ICUUncheckedIOException.
1853      */
1854     public static <A extends Appendable> A appendTo(CharSequence string, A appendable) {
1855         try {
1856             appendable.append(string);
1857             return appendable;
1858         } catch (IOException e) {
1859             throw new ICUUncheckedIOException(e);
1860         }
1861     }
1862 
1863     /**
1864      * Java 8+ String#join(CharSequence, Iterable<? extends CharSequence>) compatible method for Java 7 env.
1865      * @param delimiter the delimiter that separates each element
1866      * @param elements the elements to join together.
1867      * @return a new String that is composed of the elements separated by the delimiter
1868      * @throws NullPointerException If delimiter or elements is null
1869      */
1870     public static String joinStrings(CharSequence delimiter, Iterable<? extends CharSequence> elements) {
1871         if (delimiter == null || elements == null) {
1872             throw new NullPointerException("Delimiter or elements is null");
1873         }
1874         StringBuilder buf = new StringBuilder();
1875         Iterator<? extends CharSequence> itr = elements.iterator();
1876         boolean isFirstElem = true;
1877         while (itr.hasNext()) {
1878             CharSequence element = itr.next();
1879             if (element != null) {
1880                 if (!isFirstElem) {
1881                     buf.append(delimiter);
1882                 } else {
1883                     isFirstElem = false;
1884                 }
1885                 buf.append(element);
1886             }
1887         }
1888         return buf.toString();
1889     }
1890 }
1891