• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 1996-2015, International Business Machines Corporation and    *
6  * others. All Rights Reserved.                                                *
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import java.io.IOException;
12 import java.util.ArrayList;
13 import java.util.Iterator;
14 import java.util.Locale;
15 import java.util.regex.Pattern;
16 
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.text.Replaceable;
19 import com.ibm.icu.text.UTF16;
20 import com.ibm.icu.text.UnicodeMatcher;
21 import com.ibm.icu.util.ICUUncheckedIOException;
22 
23 public final class Utility {
24 
25     private static final char APOSTROPHE = '\'';
26     private static final char BACKSLASH  = '\\';
27     private static final int MAGIC_UNSIGNED = 0x80000000;
28 
29     /**
30      * Convenience utility to compare two Object[]s.
31      * Ought to be in System
32      */
arrayEquals(Object[] source, Object target)33     public final static boolean arrayEquals(Object[] source, Object target) {
34         if (source == null) return (target == null);
35         if (!(target instanceof Object[])) return false;
36         Object[] targ = (Object[]) target;
37         return (source.length == targ.length
38                 && arrayRegionMatches(source, 0, targ, 0, source.length));
39     }
40 
41     /**
42      * Convenience utility to compare two int[]s
43      * Ought to be in System
44      */
arrayEquals(int[] source, Object target)45     public final static boolean arrayEquals(int[] source, Object target) {
46         if (source == null) return (target == null);
47         if (!(target instanceof int[])) return false;
48         int[] targ = (int[]) target;
49         return (source.length == targ.length
50                 && arrayRegionMatches(source, 0, targ, 0, source.length));
51     }
52 
53     /**
54      * Convenience utility to compare two double[]s
55      * Ought to be in System
56      */
arrayEquals(double[] source, Object target)57     public final static boolean arrayEquals(double[] source, Object target) {
58         if (source == null) return (target == null);
59         if (!(target instanceof double[])) return false;
60         double[] targ = (double[]) target;
61         return (source.length == targ.length
62                 && arrayRegionMatches(source, 0, targ, 0, source.length));
63     }
arrayEquals(byte[] source, Object target)64     public final static boolean arrayEquals(byte[] source, Object target) {
65         if (source == null) return (target == null);
66         if (!(target instanceof byte[])) return false;
67         byte[] targ = (byte[]) target;
68         return (source.length == targ.length
69                 && arrayRegionMatches(source, 0, targ, 0, source.length));
70     }
71 
72     /**
73      * Convenience utility to compare two Object[]s
74      * Ought to be in System
75      */
arrayEquals(Object source, Object target)76     public final static boolean arrayEquals(Object source, Object target) {
77         if (source == null) return (target == null);
78         // for some reason, the correct arrayEquals is not being called
79         // so do it by hand for now.
80         if (source instanceof Object[])
81             return(arrayEquals((Object[]) source,target));
82         if (source instanceof int[])
83             return(arrayEquals((int[]) source,target));
84         if (source instanceof double[])
85             return(arrayEquals((double[]) source, target));
86         if (source instanceof byte[])
87             return(arrayEquals((byte[]) source,target));
88         return source.equals(target);
89     }
90 
91     /**
92      * Convenience utility to compare two Object[]s
93      * Ought to be in System.
94      * @param len the length to compare.
95      * The start indices and start+len must be valid.
96      */
arrayRegionMatches(Object[] source, int sourceStart, Object[] target, int targetStart, int len)97     public final static boolean arrayRegionMatches(Object[] source, int sourceStart,
98             Object[] target, int targetStart,
99             int len)
100     {
101         int sourceEnd = sourceStart + len;
102         int delta = targetStart - sourceStart;
103         for (int i = sourceStart; i < sourceEnd; i++) {
104             if (!arrayEquals(source[i],target[i + delta]))
105                 return false;
106         }
107         return true;
108     }
109 
110     /**
111      * Convenience utility to compare two Object[]s
112      * Ought to be in System.
113      * @param len the length to compare.
114      * The start indices and start+len must be valid.
115      */
arrayRegionMatches(char[] source, int sourceStart, char[] target, int targetStart, int len)116     public final static boolean arrayRegionMatches(char[] source, int sourceStart,
117             char[] target, int targetStart,
118             int len)
119     {
120         int sourceEnd = sourceStart + len;
121         int delta = targetStart - sourceStart;
122         for (int i = sourceStart; i < sourceEnd; i++) {
123             if (source[i]!=target[i + delta])
124                 return false;
125         }
126         return true;
127     }
128 
129     /**
130      * Convenience utility to compare two int[]s.
131      * @param len the length to compare.
132      * The start indices and start+len must be valid.
133      * Ought to be in System
134      */
arrayRegionMatches(int[] source, int sourceStart, int[] target, int targetStart, int len)135     public final static boolean arrayRegionMatches(int[] source, int sourceStart,
136             int[] target, int targetStart,
137             int len)
138     {
139         int sourceEnd = sourceStart + len;
140         int delta = targetStart - sourceStart;
141         for (int i = sourceStart; i < sourceEnd; i++) {
142             if (source[i] != target[i + delta])
143                 return false;
144         }
145         return true;
146     }
147 
148     /**
149      * Convenience utility to compare two arrays of doubles.
150      * @param len the length to compare.
151      * The start indices and start+len must be valid.
152      * Ought to be in System
153      */
arrayRegionMatches(double[] source, int sourceStart, double[] target, int targetStart, int len)154     public final static boolean arrayRegionMatches(double[] source, int sourceStart,
155             double[] target, int targetStart,
156             int len)
157     {
158         int sourceEnd = sourceStart + len;
159         int delta = targetStart - sourceStart;
160         for (int i = sourceStart; i < sourceEnd; i++) {
161             if (source[i] != target[i + delta])
162                 return false;
163         }
164         return true;
165     }
arrayRegionMatches(byte[] source, int sourceStart, byte[] target, int targetStart, int len)166     public final static boolean arrayRegionMatches(byte[] source, int sourceStart,
167             byte[] target, int targetStart, int len){
168         int sourceEnd = sourceStart + len;
169         int delta = targetStart - sourceStart;
170         for (int i = sourceStart; i < sourceEnd; i++) {
171             if (source[i] != target[i + delta])
172                 return false;
173         }
174         return true;
175     }
176 
177     /**
178      * Trivial reference equality.
179      * This method should help document that we really want == not equals(),
180      * and to have a single place to suppress warnings from static analysis tools.
181      */
sameObjects(Object a, Object b)182     public static final boolean sameObjects(Object a, Object b) {
183         return a == b;
184     }
185 
186     /**
187      * Convenience utility. Does null checks on objects, then calls compare.
188      */
checkCompare(T a, T b)189     public static <T extends Comparable<T>> int checkCompare(T a, T b) {
190         return a == null ?
191                 b == null ? 0 : -1 :
192                     b == null ? 1 : a.compareTo(b);
193       }
194 
195     /**
196      * Convenience utility. Does null checks on object, then calls hashCode.
197      */
checkHash(Object a)198     public static int checkHash(Object a) {
199         return a == null ? 0 : a.hashCode();
200       }
201 
202     /**
203      * The ESCAPE character is used during run-length encoding.  It signals
204      * a run of identical chars.
205      */
206     private static final char ESCAPE = '\uA5A5';
207 
208     /**
209      * The ESCAPE_BYTE character is used during run-length encoding.  It signals
210      * a run of identical bytes.
211      */
212     static final byte ESCAPE_BYTE = (byte)0xA5;
213 
214     /**
215      * Construct a string representing an int array.  Use run-length encoding.
216      * A character represents itself, unless it is the ESCAPE character.  Then
217      * the following notations are possible:
218      *   ESCAPE ESCAPE   ESCAPE literal
219      *   ESCAPE n c      n instances of character c
220      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
221      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
222      * If we encounter a run where n == ESCAPE, we represent this as:
223      *   c ESCAPE n-1 c
224      * The ESCAPE value is chosen so as not to collide with commonly
225      * seen values.
226      */
arrayToRLEString(int[] a)227     static public final String arrayToRLEString(int[] a) {
228         StringBuilder buffer = new StringBuilder();
229 
230         appendInt(buffer, a.length);
231         int runValue = a[0];
232         int runLength = 1;
233         for (int i=1; i<a.length; ++i) {
234             int s = a[i];
235             if (s == runValue && runLength < 0xFFFF) {
236                 ++runLength;
237             } else {
238                 encodeRun(buffer, runValue, runLength);
239                 runValue = s;
240                 runLength = 1;
241             }
242         }
243         encodeRun(buffer, runValue, runLength);
244         return buffer.toString();
245     }
246 
247     /**
248      * Construct a string representing a short array.  Use run-length encoding.
249      * A character represents itself, unless it is the ESCAPE character.  Then
250      * the following notations are possible:
251      *   ESCAPE ESCAPE   ESCAPE literal
252      *   ESCAPE n c      n instances of character c
253      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
254      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
255      * If we encounter a run where n == ESCAPE, we represent this as:
256      *   c ESCAPE n-1 c
257      * The ESCAPE value is chosen so as not to collide with commonly
258      * seen values.
259      */
arrayToRLEString(short[] a)260     static public final String arrayToRLEString(short[] a) {
261         StringBuilder buffer = new StringBuilder();
262         // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]);
263         buffer.append((char) (a.length >> 16));
264         buffer.append((char) a.length);
265         short runValue = a[0];
266         int runLength = 1;
267         for (int i=1; i<a.length; ++i) {
268             short s = a[i];
269             if (s == runValue && runLength < 0xFFFF) ++runLength;
270             else {
271                 encodeRun(buffer, runValue, runLength);
272                 runValue = s;
273                 runLength = 1;
274             }
275         }
276         encodeRun(buffer, runValue, runLength);
277         return buffer.toString();
278     }
279 
280     /**
281      * Construct a string representing a char array.  Use run-length encoding.
282      * A character represents itself, unless it is the ESCAPE character.  Then
283      * the following notations are possible:
284      *   ESCAPE ESCAPE   ESCAPE literal
285      *   ESCAPE n c      n instances of character c
286      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
287      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
288      * If we encounter a run where n == ESCAPE, we represent this as:
289      *   c ESCAPE n-1 c
290      * The ESCAPE value is chosen so as not to collide with commonly
291      * seen values.
292      */
arrayToRLEString(char[] a)293     static public final String arrayToRLEString(char[] a) {
294         StringBuilder buffer = new StringBuilder();
295         buffer.append((char) (a.length >> 16));
296         buffer.append((char) a.length);
297         char runValue = a[0];
298         int runLength = 1;
299         for (int i=1; i<a.length; ++i) {
300             char s = a[i];
301             if (s == runValue && runLength < 0xFFFF) ++runLength;
302             else {
303                 encodeRun(buffer, (short)runValue, runLength);
304                 runValue = s;
305                 runLength = 1;
306             }
307         }
308         encodeRun(buffer, (short)runValue, runLength);
309         return buffer.toString();
310     }
311 
312     /**
313      * Construct a string representing a byte array.  Use run-length encoding.
314      * Two bytes are packed into a single char, with a single extra zero byte at
315      * the end if needed.  A byte represents itself, unless it is the
316      * ESCAPE_BYTE.  Then the following notations are possible:
317      *   ESCAPE_BYTE ESCAPE_BYTE   ESCAPE_BYTE literal
318      *   ESCAPE_BYTE n b           n instances of byte b
319      * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
320      * more bytes.  Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
321      * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
322      *   b ESCAPE_BYTE n-1 b
323      * The ESCAPE_BYTE value is chosen so as not to collide with commonly
324      * seen values.
325      */
arrayToRLEString(byte[] a)326     static public final String arrayToRLEString(byte[] a) {
327         StringBuilder buffer = new StringBuilder();
328         buffer.append((char) (a.length >> 16));
329         buffer.append((char) a.length);
330         byte runValue = a[0];
331         int runLength = 1;
332         byte[] state = new byte[2];
333         for (int i=1; i<a.length; ++i) {
334             byte b = a[i];
335             if (b == runValue && runLength < 0xFF) ++runLength;
336             else {
337                 encodeRun(buffer, runValue, runLength, state);
338                 runValue = b;
339                 runLength = 1;
340             }
341         }
342         encodeRun(buffer, runValue, runLength, state);
343 
344         // We must save the final byte, if there is one, by padding
345         // an extra zero.
346         if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state);
347 
348         return buffer.toString();
349     }
350 
351     /**
352      * Encode a run, possibly a degenerate run (of < 4 values).
353      * @param length The length of the run; must be > 0 && <= 0xFFFF.
354      */
encodeRun(T buffer, int value, int length)355     private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) {
356         if (length < 4) {
357             for (int j=0; j<length; ++j) {
358                 if (value == ESCAPE) {
359                     appendInt(buffer, value);
360                 }
361                 appendInt(buffer, value);
362             }
363         }
364         else {
365             if (length == ESCAPE) {
366                 if (value == ESCAPE) {
367                     appendInt(buffer, ESCAPE);
368                 }
369                 appendInt(buffer, value);
370                 --length;
371             }
372             appendInt(buffer, ESCAPE);
373             appendInt(buffer, length);
374             appendInt(buffer, value); // Don't need to escape this value
375         }
376     }
377 
appendInt(T buffer, int value)378     private static final <T extends Appendable> void appendInt(T buffer, int value) {
379         try {
380             buffer.append((char)(value >>> 16));
381             buffer.append((char)(value & 0xFFFF));
382         } catch (IOException e) {
383             throw new IllegalIcuArgumentException(e);
384         }
385     }
386 
387     /**
388      * Encode a run, possibly a degenerate run (of < 4 values).
389      * @param length The length of the run; must be > 0 && <= 0xFFFF.
390      */
encodeRun(T buffer, short value, int length)391     private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) {
392         try {
393             char valueChar = (char) value;
394             if (length < 4) {
395                 for (int j=0; j<length; ++j) {
396                     if (valueChar == ESCAPE) {
397                         buffer.append(ESCAPE);
398                     }
399                     buffer.append(valueChar);
400                 }
401             }
402             else {
403                 if (length == ESCAPE) {
404                     if (valueChar == ESCAPE) {
405                         buffer.append(ESCAPE);
406                     }
407                     buffer.append(valueChar);
408                     --length;
409                 }
410                 buffer.append(ESCAPE);
411                 buffer.append((char) length);
412                 buffer.append(valueChar); // Don't need to escape this value
413             }
414         } catch (IOException e) {
415             throw new IllegalIcuArgumentException(e);
416         }
417     }
418 
419     /**
420      * Encode a run, possibly a degenerate run (of < 4 values).
421      * @param length The length of the run; must be > 0 && <= 0xFF.
422      */
encodeRun(T buffer, byte value, int length, byte[] state)423     private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length,
424             byte[] state) {
425         if (length < 4) {
426             for (int j=0; j<length; ++j) {
427                 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
428                 appendEncodedByte(buffer, value, state);
429             }
430         }
431         else {
432             if ((byte)length == ESCAPE_BYTE) {
433                 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
434                 appendEncodedByte(buffer, value, state);
435                 --length;
436             }
437             appendEncodedByte(buffer, ESCAPE_BYTE, state);
438             appendEncodedByte(buffer, (byte)length, state);
439             appendEncodedByte(buffer, value, state); // Don't need to escape this value
440         }
441     }
442 
443     /**
444      * Append a byte to the given Appendable, packing two bytes into each
445      * character.  The state parameter maintains intermediary data between
446      * calls.
447      * @param state A two-element array, with state[0] == 0 if this is the
448      * first byte of a pair, or state[0] != 0 if this is the second byte
449      * of a pair, in which case state[1] is the first byte.
450      */
appendEncodedByte(T buffer, byte value, byte[] state)451     private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value,
452             byte[] state) {
453         try {
454             if (state[0] != 0) {
455                 char c = (char) ((state[1] << 8) | ((value) & 0xFF));
456                 buffer.append(c);
457                 state[0] = 0;
458             }
459             else {
460                 state[0] = 1;
461                 state[1] = value;
462             }
463         } catch (IOException e) {
464             throw new IllegalIcuArgumentException(e);
465         }
466     }
467 
468     /**
469      * Construct an array of ints from a run-length encoded string.
470      */
RLEStringToIntArray(String s)471     static public final int[] RLEStringToIntArray(String s) {
472         int length = getInt(s, 0);
473         int[] array = new int[length];
474         int ai = 0, i = 1;
475 
476         int maxI = s.length() / 2;
477         while (ai < length && i < maxI) {
478             int c = getInt(s, i++);
479 
480             if (c == ESCAPE) {
481                 c = getInt(s, i++);
482                 if (c == ESCAPE) {
483                     array[ai++] = c;
484                 } else {
485                     int runLength = c;
486                     int runValue = getInt(s, i++);
487                     for (int j=0; j<runLength; ++j) {
488                         array[ai++] = runValue;
489                     }
490                 }
491             }
492             else {
493                 array[ai++] = c;
494             }
495         }
496 
497         if (ai != length || i != maxI) {
498             throw new IllegalStateException("Bad run-length encoded int array");
499         }
500 
501         return array;
502     }
getInt(String s, int i)503     static final int getInt(String s, int i) {
504         return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1);
505     }
506 
507     /**
508      * Construct an array of shorts from a run-length encoded string.
509      */
RLEStringToShortArray(String s)510     static public final short[] RLEStringToShortArray(String s) {
511         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
512         short[] array = new short[length];
513         int ai = 0;
514         for (int i=2; i<s.length(); ++i) {
515             char c = s.charAt(i);
516             if (c == ESCAPE) {
517                 c = s.charAt(++i);
518                 if (c == ESCAPE) {
519                     array[ai++] = (short) c;
520                 } else {
521                     int runLength = c;
522                     short runValue = (short) s.charAt(++i);
523                     for (int j=0; j<runLength; ++j) array[ai++] = runValue;
524                 }
525             }
526             else {
527                 array[ai++] = (short) c;
528             }
529         }
530 
531         if (ai != length)
532             throw new IllegalStateException("Bad run-length encoded short array");
533 
534         return array;
535     }
536 
537     /**
538      * Construct an array of shorts from a run-length encoded string.
539      */
RLEStringToCharArray(String s)540     static public final char[] RLEStringToCharArray(String s) {
541         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
542         char[] array = new char[length];
543         int ai = 0;
544         for (int i=2; i<s.length(); ++i) {
545             char c = s.charAt(i);
546             if (c == ESCAPE) {
547                 c = s.charAt(++i);
548                 if (c == ESCAPE) {
549                     array[ai++] = c;
550                 } else {
551                     int runLength = c;
552                     char runValue = s.charAt(++i);
553                     for (int j=0; j<runLength; ++j) array[ai++] = runValue;
554                 }
555             }
556             else {
557                 array[ai++] = c;
558             }
559         }
560 
561         if (ai != length)
562             throw new IllegalStateException("Bad run-length encoded short array");
563 
564         return array;
565     }
566 
567     /**
568      * Construct an array of bytes from a run-length encoded string.
569      */
RLEStringToByteArray(String s)570     static public final byte[] RLEStringToByteArray(String s) {
571         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
572         byte[] array = new byte[length];
573         boolean nextChar = true;
574         char c = 0;
575         int node = 0;
576         int runLength = 0;
577         int i = 2;
578         for (int ai=0; ai<length; ) {
579             // This part of the loop places the next byte into the local
580             // variable 'b' each time through the loop.  It keeps the
581             // current character in 'c' and uses the boolean 'nextChar'
582             // to see if we've taken both bytes out of 'c' yet.
583             byte b;
584             if (nextChar) {
585                 c = s.charAt(i++);
586                 b = (byte) (c >> 8);
587                 nextChar = false;
588             }
589             else {
590                 b = (byte) (c & 0xFF);
591                 nextChar = true;
592             }
593 
594             // This part of the loop is a tiny state machine which handles
595             // the parsing of the run-length encoding.  This would be simpler
596             // if we could look ahead, but we can't, so we use 'node' to
597             // move between three nodes in the state machine.
598             switch (node) {
599             case 0:
600                 // Normal idle node
601                 if (b == ESCAPE_BYTE) {
602                     node = 1;
603                 }
604                 else {
605                     array[ai++] = b;
606                 }
607                 break;
608             case 1:
609                 // We have seen one ESCAPE_BYTE; we expect either a second
610                 // one, or a run length and value.
611                 if (b == ESCAPE_BYTE) {
612                     array[ai++] = ESCAPE_BYTE;
613                     node = 0;
614                 }
615                 else {
616                     runLength = b;
617                     // Interpret signed byte as unsigned
618                     if (runLength < 0) runLength += 0x100;
619                     node = 2;
620                 }
621                 break;
622             case 2:
623                 // We have seen an ESCAPE_BYTE and length byte.  We interpret
624                 // the next byte as the value to be repeated.
625                 for (int j=0; j<runLength; ++j) array[ai++] = b;
626                 node = 0;
627                 break;
628             }
629         }
630 
631         if (node != 0)
632             throw new IllegalStateException("Bad run-length encoded byte array");
633 
634         if (i != s.length())
635             throw new IllegalStateException("Excess data in RLE byte array string");
636 
637         return array;
638     }
639 
640     static public String LINE_SEPARATOR = System.getProperty("line.separator");
641 
642     /**
643      * Format a String for representation in a source file.  This includes
644      * breaking it into lines and escaping characters using octal notation
645      * when necessary (control characters and double quotes).
646      */
formatForSource(String s)647     static public final String formatForSource(String s) {
648         StringBuilder buffer = new StringBuilder();
649         for (int i=0; i<s.length();) {
650             if (i > 0) buffer.append('+').append(LINE_SEPARATOR);
651             buffer.append("        \"");
652             int count = 11;
653             while (i<s.length() && count<80) {
654                 char c = s.charAt(i++);
655                 if (c < '\u0020' || c == '"' || c == '\\') {
656                     if (c == '\n') {
657                         buffer.append("\\n");
658                         count += 2;
659                     } else if (c == '\t') {
660                         buffer.append("\\t");
661                         count += 2;
662                     } else if (c == '\r') {
663                         buffer.append("\\r");
664                         count += 2;
665                     } else {
666                         // Represent control characters, backslash and double quote
667                         // using octal notation; otherwise the string we form
668                         // won't compile, since Unicode escape sequences are
669                         // processed before tokenization.
670                         buffer.append('\\');
671                         buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
672                         buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
673                         buffer.append(HEX_DIGIT[(c & 0007)]);
674                         count += 4;
675                     }
676                 }
677                 else if (c <= '\u007E') {
678                     buffer.append(c);
679                     count += 1;
680                 }
681                 else {
682                     buffer.append("\\u");
683                     buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
684                     buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
685                     buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
686                     buffer.append(HEX_DIGIT[(c & 0x000F)]);
687                     count += 6;
688                 }
689             }
690             buffer.append('"');
691         }
692         return buffer.toString();
693     }
694 
695     static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7',
696         '8','9','A','B','C','D','E','F'};
697 
698     /**
699      * Format a String for representation in a source file.  Like
700      * formatForSource but does not do line breaking.
701      */
format1ForSource(String s)702     static public final String format1ForSource(String s) {
703         StringBuilder buffer = new StringBuilder();
704         buffer.append("\"");
705         for (int i=0; i<s.length();) {
706             char c = s.charAt(i++);
707             if (c < '\u0020' || c == '"' || c == '\\') {
708                 if (c == '\n') {
709                     buffer.append("\\n");
710                 } else if (c == '\t') {
711                     buffer.append("\\t");
712                 } else if (c == '\r') {
713                     buffer.append("\\r");
714                 } else {
715                     // Represent control characters, backslash and double quote
716                     // using octal notation; otherwise the string we form
717                     // won't compile, since Unicode escape sequences are
718                     // processed before tokenization.
719                     buffer.append('\\');
720                     buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
721                     buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
722                     buffer.append(HEX_DIGIT[(c & 0007)]);
723                 }
724             }
725             else if (c <= '\u007E') {
726                 buffer.append(c);
727             }
728             else {
729                 buffer.append("\\u");
730                 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
731                 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
732                 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
733                 buffer.append(HEX_DIGIT[(c & 0x000F)]);
734             }
735         }
736         buffer.append('"');
737         return buffer.toString();
738     }
739 
740     /**
741      * Convert characters outside the range U+0020 to U+007F to
742      * Unicode escapes, and convert backslash to a double backslash.
743      */
escape(String s)744     public static final String escape(String s) {
745         StringBuilder buf = new StringBuilder();
746         for (int i=0; i<s.length(); ) {
747             int c = Character.codePointAt(s, i);
748             i += UTF16.getCharCount(c);
749             if (c >= ' ' && c <= 0x007F) {
750                 if (c == '\\') {
751                     buf.append("\\\\"); // That is, "\\"
752                 } else {
753                     buf.append((char)c);
754                 }
755             } else {
756                 boolean four = c <= 0xFFFF;
757                 buf.append(four ? "\\u" : "\\U");
758                 buf.append(hex(c, four ? 4 : 8));
759             }
760         }
761         return buf.toString();
762     }
763 
764     /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
765     static private final char[] UNESCAPE_MAP = {
766         /*"   0x22, 0x22 */
767         /*'   0x27, 0x27 */
768         /*?   0x3F, 0x3F */
769         /*\   0x5C, 0x5C */
770         /*a*/ 0x61, 0x07,
771         /*b*/ 0x62, 0x08,
772         /*e*/ 0x65, 0x1b,
773         /*f*/ 0x66, 0x0c,
774         /*n*/ 0x6E, 0x0a,
775         /*r*/ 0x72, 0x0d,
776         /*t*/ 0x74, 0x09,
777         /*v*/ 0x76, 0x0b
778     };
779 
780     /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
_digit8(int c)781     private static final int _digit8(int c) {
782         if (c >= '0' && c <= '7') {
783             return c - '0';
784         }
785         return -1;
786     }
787 
788     /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
_digit16(int c)789     private static final int _digit16(int c) {
790         if (c >= '0' && c <= '9') {
791             return c - '0';
792         }
793         if (c >= 'A' && c <= 'F') {
794             return c - ('A' - 10);
795         }
796         if (c >= 'a' && c <= 'f') {
797             return c - ('a' - 10);
798         }
799         return -1;
800     }
801 
802     /**
803      * Converts an escape to a code point value. We attempt
804      * to parallel the icu4c unescapeAt() function.
805      * This function returns an integer with
806      * both the code point (bits 28..8) and the length of the escape sequence (bits 7..0).
807      * offset+length is the index after the escape sequence.
808      *
809      * @param offset the offset to the character <em>after</em> the backslash.
810      * @return the code point and length, or -1 on error.
811      */
unescapeAndLengthAt(CharSequence s, int offset)812     public static int unescapeAndLengthAt(CharSequence s, int offset) {
813         return unescapeAndLengthAt(s, offset, s.length());
814     }
815 
unescapeAndLengthAt(CharSequence s, int offset, int length)816     private static int unescapeAndLengthAt(CharSequence s, int offset, int length) {
817         int result = 0;
818         int n = 0;
819         int minDig = 0;
820         int maxDig = 0;
821         int bitsPerDigit = 4;
822         int dig;
823         boolean braces = false;
824 
825         /* Check that offset is in range */
826         if (offset < 0 || offset >= length) {
827             return -1;
828         }
829         int start = offset;
830 
831         /* Fetch first UChar after '\\' */
832         int c = s.charAt(offset++);
833 
834         /* Convert hexadecimal and octal escapes */
835         switch (c) {
836         case 'u':
837             minDig = maxDig = 4;
838             break;
839         case 'U':
840             minDig = maxDig = 8;
841             break;
842         case 'x':
843             minDig = 1;
844             if (offset < length && s.charAt(offset) == '{') {
845                 ++offset;
846                 braces = true;
847                 maxDig = 8;
848             } else {
849                 maxDig = 2;
850             }
851             break;
852         default:
853             dig = _digit8(c);
854             if (dig >= 0) {
855                 minDig = 1;
856                 maxDig = 3;
857                 n = 1; /* Already have first octal digit */
858                 bitsPerDigit = 3;
859                 result = dig;
860             }
861             break;
862         }
863         if (minDig != 0) {
864             while (offset < length && n < maxDig) {
865                 c = s.charAt(offset);
866                 dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
867                 if (dig < 0) {
868                     break;
869                 }
870                 result = (result << bitsPerDigit) | dig;
871                 ++offset;
872                 ++n;
873             }
874             if (n < minDig) {
875                 return -1;
876             }
877             if (braces) {
878                 if (c != '}') {
879                     return -1;
880                 }
881                 ++offset;
882             }
883             if (result < 0 || result >= 0x110000) {
884                 return -1;
885             }
886             // If an escape sequence specifies a lead surrogate, see
887             // if there is a trail surrogate after it, either as an
888             // escape or as a literal.  If so, join them up into a
889             // supplementary.
890             if (offset < length && UTF16.isLeadSurrogate(result)) {
891                 int ahead = offset+1;
892                 c = s.charAt(offset);
893                 if (c == '\\' && ahead < length) {
894                     // Calling ourselves recursively may cause a stack overflow if
895                     // we have repeated escaped lead surrogates.
896                     // Limit the length to 11 ("x{0000DFFF}") after ahead.
897                     int tailLimit = ahead + 11;
898                     if (tailLimit > length) {
899                         tailLimit = length;
900                     }
901                     int cpAndLength = unescapeAndLengthAt(s, ahead, tailLimit);
902                     if (cpAndLength >= 0) {
903                         c = cpAndLength >> 8;
904                         ahead += cpAndLength & 0xff;
905                     }
906                 }
907                 if (UTF16.isTrailSurrogate(c)) {
908                     offset = ahead;
909                     result = UCharacter.toCodePoint(result, c);
910                 }
911             }
912             return codePointAndLength(result, start, offset);
913         }
914 
915         /* Convert C-style escapes in table */
916         for (int i=0; i<UNESCAPE_MAP.length; i+=2) {
917             if (c == UNESCAPE_MAP[i]) {
918                 return codePointAndLength(UNESCAPE_MAP[i+1], start, offset);
919             } else if (c < UNESCAPE_MAP[i]) {
920                 break;
921             }
922         }
923 
924         /* Map \cX to control-X: X & 0x1F */
925         if (c == 'c' && offset < length) {
926             c = Character.codePointAt(s, offset);
927             return codePointAndLength(c & 0x1F, start, offset + Character.charCount(c));
928         }
929 
930         /* If no special forms are recognized, then consider
931          * the backslash to generically escape the next character.
932          * Deal with surrogate pairs. */
933         if (UTF16.isLeadSurrogate(c) && offset < length) {
934             int c2 = s.charAt(offset);
935             if (UTF16.isTrailSurrogate(c2)) {
936                 ++offset;
937                 c = UCharacter.toCodePoint(c, c2);
938             }
939         }
940         return codePointAndLength(c, start, offset);
941     }
942 
codePointAndLength(int c, int length)943     private static int codePointAndLength(int c, int length) {
944         assert 0 <= c && c <= 0x10ffff;
945         assert 0 <= length && length <= 0xff;
946         return c << 8 | length;
947     }
948 
codePointAndLength(int c, int start, int limit)949     private static int codePointAndLength(int c, int start, int limit) {
950         return codePointAndLength(c, limit - start);
951     }
952 
cpFromCodePointAndLength(int cpAndLength)953     public static int cpFromCodePointAndLength(int cpAndLength) {
954         assert cpAndLength >= 0;
955         return cpAndLength >> 8;
956     }
957 
lengthFromCodePointAndLength(int cpAndLength)958     public static int lengthFromCodePointAndLength(int cpAndLength) {
959         assert cpAndLength >= 0;
960         return cpAndLength & 0xff;
961     }
962 
963     /**
964      * Convert all escapes in a given string using unescapeAndLengthAt().
965      * @exception IllegalArgumentException if an invalid escape is
966      * seen.
967      */
unescape(CharSequence s)968     public static String unescape(CharSequence s) {
969         StringBuilder buf = null;
970         for (int i=0; i<s.length(); ) {
971             char c = s.charAt(i++);
972             if (c == '\\') {
973                 if (buf == null) {
974                     buf = new StringBuilder(s.length()).append(s, 0, i - 1);
975                 }
976                 int cpAndLength = unescapeAndLengthAt(s, i);
977                 if (cpAndLength < 0) {
978                     throw new IllegalArgumentException("Invalid escape sequence " +
979                             s.subSequence(i-1, Math.min(i+9, s.length())));
980                 }
981                 buf.appendCodePoint(cpAndLength >> 8);
982                 i += cpAndLength & 0xff;
983             } else if (buf != null) {
984                 // We could optimize this further by appending whole substrings between escapes.
985                 buf.append(c);
986             }
987         }
988         if (buf == null) {
989             // No escapes in s.
990             return s.toString();
991         }
992         return buf.toString();
993     }
994 
995     /**
996      * Convert all escapes in a given string using unescapeAndLengthAt().
997      * Leave invalid escape sequences unchanged.
998      */
unescapeLeniently(CharSequence s)999     public static String unescapeLeniently(CharSequence s) {
1000         StringBuilder buf = null;
1001         for (int i=0; i<s.length(); ) {
1002             char c = s.charAt(i++);
1003             if (c == '\\') {
1004                 if (buf == null) {
1005                     buf = new StringBuilder(s.length()).append(s, 0, i - 1);
1006                 }
1007                 int cpAndLength = unescapeAndLengthAt(s, i);
1008                 if (cpAndLength < 0) {
1009                     buf.append(c);
1010                 } else {
1011                     buf.appendCodePoint(cpAndLength >> 8);
1012                     i += cpAndLength & 0xff;
1013                 }
1014             } else if (buf != null) {
1015                 // We could optimize this further by appending whole substrings between escapes.
1016                 buf.append(c);
1017             }
1018         }
1019         if (buf == null) {
1020             // No escapes in s.
1021             return s.toString();
1022         }
1023         return buf.toString();
1024     }
1025 
1026     /**
1027      * Convert a char to 4 hex uppercase digits.  E.g., hex('a') =>
1028      * "0041".
1029      */
hex(long ch)1030     public static String hex(long ch) {
1031         return hex(ch, 4);
1032     }
1033 
1034     /**
1035      * Supplies a zero-padded hex representation of an integer (without 0x)
1036      */
hex(long i, int places)1037     static public String hex(long i, int places) {
1038         if (i == Long.MIN_VALUE) return "-8000000000000000";
1039         boolean negative = i < 0;
1040         if (negative) {
1041             i = -i;
1042         }
1043         String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
1044         if (result.length() < places) {
1045             result = "0000000000000000".substring(result.length(),places) + result;
1046         }
1047         if (negative) {
1048             return '-' + result;
1049         }
1050         return result;
1051     }
1052 
1053     /**
1054      * Convert a string to comma-separated groups of 4 hex uppercase
1055      * digits.  E.g., hex('ab') => "0041,0042".
1056      */
1057     public static String hex(CharSequence s) {
1058         return hex(s, 4, ",", true, new StringBuilder()).toString();
1059     }
1060 
1061     /**
1062      * Convert a string to separated groups of hex uppercase
1063      * digits.  E.g., hex('ab'...) => "0041,0042".  Append the output
1064      * to the given Appendable.
1065      */
1066     public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
1067         try {
1068             if (useCodePoints) {
1069                 int cp;
1070                 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1071                     cp = Character.codePointAt(s, i);
1072                     if (i != 0) {
1073                         result.append(separator);
1074                     }
1075                     result.append(hex(cp,width));
1076                 }
1077             } else {
1078                 for (int i = 0; i < s.length(); ++i) {
1079                     if (i != 0) {
1080                         result.append(separator);
1081                     }
1082                     result.append(hex(s.charAt(i),width));
1083                 }
1084             }
1085             return result;
1086         } catch (IOException e) {
1087             throw new IllegalIcuArgumentException(e);
1088         }
1089     }
1090 
1091     public static String hex(byte[] o, int start, int end, String separator) {
1092         StringBuilder result = new StringBuilder();
1093         //int ch;
1094         for (int i = start; i < end; ++i) {
1095           if (i != 0) result.append(separator);
1096           result.append(hex(o[i]));
1097         }
1098         return result.toString();
1099       }
1100 
1101     /**
1102      * Convert a string to comma-separated groups of 4 hex uppercase
1103      * digits.  E.g., hex('ab') => "0041,0042".
1104      */
1105     public static <S extends CharSequence> String hex(S s, int width, S separator) {
1106         return hex(s, width, separator, true, new StringBuilder()).toString();
1107     }
1108 
1109     /**
1110      * Split a string into pieces based on the given divider character
1111      * @param s the string to split
1112      * @param divider the character on which to split.  Occurrences of
1113      * this character are not included in the output
1114      * @param output an array to receive the substrings between
1115      * instances of divider.  It must be large enough on entry to
1116      * accommodate all output.  Adjacent instances of the divider
1117      * character will place empty strings into output.  Before
1118      * returning, output is padded out with empty strings.
1119      */
1120     public static void split(String s, char divider, String[] output) {
1121         int last = 0;
1122         int current = 0;
1123         int i;
1124         for (i = 0; i < s.length(); ++i) {
1125             if (s.charAt(i) == divider) {
1126                 output[current++] = s.substring(last,i);
1127                 last = i+1;
1128             }
1129         }
1130         output[current++] = s.substring(last,i);
1131         while (current < output.length) {
1132             output[current++] = "";
1133         }
1134     }
1135 
1136     /**
1137      * Split a string into pieces based on the given divider character
1138      * @param s the string to split
1139      * @param divider the character on which to split.  Occurrences of
1140      * this character are not included in the output
1141      * @return output an array to receive the substrings between
1142      * instances of divider. Adjacent instances of the divider
1143      * character will place empty strings into output.
1144      */
1145     public static String[] split(String s, char divider) {
1146         int last = 0;
1147         int i;
1148         ArrayList<String> output = new ArrayList<>();
1149         for (i = 0; i < s.length(); ++i) {
1150             if (s.charAt(i) == divider) {
1151                 output.add(s.substring(last,i));
1152                 last = i+1;
1153             }
1154         }
1155         output.add( s.substring(last,i));
1156         return output.toArray(new String[output.size()]);
1157     }
1158 
1159     /**
1160      * Look up a given string in a string array.  Returns the index at
1161      * which the first occurrence of the string was found in the
1162      * array, or -1 if it was not found.
1163      * @param source the string to search for
1164      * @param target the array of zero or more strings in which to
1165      * look for source
1166      * @return the index of target at which source first occurs, or -1
1167      * if not found
1168      */
1169     public static int lookup(String source, String[] target) {
1170         for (int i = 0; i < target.length; ++i) {
1171             if (source.equals(target[i])) return i;
1172         }
1173         return -1;
1174     }
1175 
1176     /**
1177      * Parse a single non-whitespace character 'ch', optionally
1178      * preceded by whitespace.
1179      * @param id the string to be parsed
1180      * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
1181      * offset of the first character to be parsed.  On output, pos[0]
1182      * is the index after the last parsed character.  If the parse
1183      * fails, pos[0] will be unchanged.
1184      * @param ch the non-whitespace character to be parsed.
1185      * @return true if 'ch' is seen preceded by zero or more
1186      * whitespace characters.
1187      */
1188     public static boolean parseChar(String id, int[] pos, char ch) {
1189         int start = pos[0];
1190         pos[0] = PatternProps.skipWhiteSpace(id, pos[0]);
1191         if (pos[0] == id.length() ||
1192                 id.charAt(pos[0]) != ch) {
1193             pos[0] = start;
1194             return false;
1195         }
1196         ++pos[0];
1197         return true;
1198     }
1199 
1200     /**
1201      * Parse a pattern string starting at offset pos.  Keywords are
1202      * matched case-insensitively.  Spaces may be skipped and may be
1203      * optional or required.  Integer values may be parsed, and if
1204      * they are, they will be returned in the given array.  If
1205      * successful, the offset of the next non-space character is
1206      * returned.  On failure, -1 is returned.
1207      * @param pattern must only contain lowercase characters, which
1208      * will match their uppercase equivalents as well.  A space
1209      * character matches one or more required spaces.  A '~' character
1210      * matches zero or more optional spaces.  A '#' character matches
1211      * an integer and stores it in parsedInts, which the caller must
1212      * ensure has enough capacity.
1213      * @param parsedInts array to receive parsed integers.  Caller
1214      * must ensure that parsedInts.length is >= the number of '#'
1215      * signs in 'pattern'.
1216      * @return the position after the last character parsed, or -1 if
1217      * the parse failed
1218      */
1219     @SuppressWarnings("fallthrough")
1220     public static int parsePattern(String rule, int pos, int limit,
1221             String pattern, int[] parsedInts) {
1222         // TODO Update this to handle surrogates
1223         int[] p = new int[1];
1224         int intCount = 0; // number of integers parsed
1225         for (int i=0; i<pattern.length(); ++i) {
1226             char cpat = pattern.charAt(i);
1227             char c;
1228             switch (cpat) {
1229             case ' ':
1230                 if (pos >= limit) {
1231                     return -1;
1232                 }
1233                 c = rule.charAt(pos++);
1234                 if (!PatternProps.isWhiteSpace(c)) {
1235                     return -1;
1236                 }
1237                 // FALL THROUGH to skipWhitespace
1238             case '~':
1239                 pos = PatternProps.skipWhiteSpace(rule, pos);
1240                 break;
1241             case '#':
1242                 p[0] = pos;
1243                 parsedInts[intCount++] = parseInteger(rule, p, limit);
1244                 if (p[0] == pos) {
1245                     // Syntax error; failed to parse integer
1246                     return -1;
1247                 }
1248                 pos = p[0];
1249                 break;
1250             default:
1251                 if (pos >= limit) {
1252                     return -1;
1253                 }
1254                 c = (char) UCharacter.toLowerCase(rule.charAt(pos++));
1255                 if (c != cpat) {
1256                     return -1;
1257                 }
1258                 break;
1259             }
1260         }
1261         return pos;
1262     }
1263 
1264     /**
1265      * Parse a pattern string within the given Replaceable and a parsing
1266      * pattern.  Characters are matched literally and case-sensitively
1267      * except for the following special characters:
1268      *
1269      * ~  zero or more Pattern_White_Space chars
1270      *
1271      * If end of pattern is reached with all matches along the way,
1272      * pos is advanced to the first unparsed index and returned.
1273      * Otherwise -1 is returned.
1274      * @param pat pattern that controls parsing
1275      * @param text text to be parsed, starting at index
1276      * @param index offset to first character to parse
1277      * @param limit offset after last character to parse
1278      * @return index after last parsed character, or -1 on parse failure.
1279      */
1280     public static int parsePattern(String pat,
1281             Replaceable text,
1282             int index,
1283             int limit) {
1284         int ipat = 0;
1285 
1286         // empty pattern matches immediately
1287         if (ipat == pat.length()) {
1288             return index;
1289         }
1290 
1291         int cpat = Character.codePointAt(pat, ipat);
1292 
1293         while (index < limit) {
1294             int c = text.char32At(index);
1295 
1296             // parse \s*
1297             if (cpat == '~') {
1298                 if (PatternProps.isWhiteSpace(c)) {
1299                     index += UTF16.getCharCount(c);
1300                     continue;
1301                 } else {
1302                     if (++ipat == pat.length()) {
1303                         return index; // success; c unparsed
1304                     }
1305                     // fall thru; process c again with next cpat
1306                 }
1307             }
1308 
1309             // parse literal
1310             else if (c == cpat) {
1311                 int n = UTF16.getCharCount(c);
1312                 index += n;
1313                 ipat += n;
1314                 if (ipat == pat.length()) {
1315                     return index; // success; c parsed
1316                 }
1317                 // fall thru; get next cpat
1318             }
1319 
1320             // match failure of literal
1321             else {
1322                 return -1;
1323             }
1324 
1325             cpat = UTF16.charAt(pat, ipat);
1326         }
1327 
1328         return -1; // text ended before end of pat
1329     }
1330 
1331     /**
1332      * Parse an integer at pos, either of the form \d+ or of the form
1333      * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
1334      * or octal format.
1335      * @param pos INPUT-OUTPUT parameter.  On input, the first
1336      * character to parse.  On output, the character after the last
1337      * parsed character.
1338      */
1339     public static int parseInteger(String rule, int[] pos, int limit) {
1340         int count = 0;
1341         int value = 0;
1342         int p = pos[0];
1343         int radix = 10;
1344 
1345         if (rule.regionMatches(true, p, "0x", 0, 2)) {
1346             p += 2;
1347             radix = 16;
1348         } else if (p < limit && rule.charAt(p) == '0') {
1349             p++;
1350             count = 1;
1351             radix = 8;
1352         }
1353 
1354         while (p < limit) {
1355             int d = UCharacter.digit(rule.charAt(p++), radix);
1356             if (d < 0) {
1357                 --p;
1358                 break;
1359             }
1360             ++count;
1361             int v = (value * radix) + d;
1362             if (v <= value) {
1363                 // If there are too many input digits, at some point
1364                 // the value will go negative, e.g., if we have seen
1365                 // "0x8000000" already and there is another '0', when
1366                 // we parse the next 0 the value will go negative.
1367                 return 0;
1368             }
1369             value = v;
1370         }
1371         if (count > 0) {
1372             pos[0] = p;
1373         }
1374         return value;
1375     }
1376 
1377     /**
1378      * Parse a Unicode identifier from the given string at the given
1379      * position.  Return the identifier, or null if there is no
1380      * identifier.
1381      * @param str the string to parse
1382      * @param pos INPUT-OUTPUT parameter.  On INPUT, pos[0] is the
1383      * first character to examine.  It must be less than str.length(),
1384      * and it must not point to a whitespace character.  That is, must
1385      * have pos[0] < str.length().  On
1386      * OUTPUT, the position after the last parsed character.
1387      * @return the Unicode identifier, or null if there is no valid
1388      * identifier at pos[0].
1389      */
1390     public static String parseUnicodeIdentifier(String str, int[] pos) {
1391         // assert(pos[0] < str.length());
1392         StringBuilder buf = new StringBuilder();
1393         int p = pos[0];
1394         while (p < str.length()) {
1395             int ch = Character.codePointAt(str, p);
1396             if (buf.length() == 0) {
1397                 if (UCharacter.isUnicodeIdentifierStart(ch)) {
1398                     buf.appendCodePoint(ch);
1399                 } else {
1400                     return null;
1401                 }
1402             } else {
1403                 if (UCharacter.isUnicodeIdentifierPart(ch)) {
1404                     buf.appendCodePoint(ch);
1405                 } else {
1406                     break;
1407                 }
1408             }
1409             p += UTF16.getCharCount(ch);
1410         }
1411         pos[0] = p;
1412         return buf.toString();
1413     }
1414 
1415     static final char DIGITS[] = {
1416         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
1417         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
1418         'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
1419         'U', 'V', 'W', 'X', 'Y', 'Z'
1420     };
1421 
1422     /**
1423      * Append the digits of a positive integer to the given
1424      * <code>Appendable</code> in the given radix. This is
1425      * done recursively since it is easiest to generate the low-
1426      * order digit first, but it must be appended last.
1427      *
1428      * @param result is the <code>Appendable</code> to append to
1429      * @param n is the positive integer
1430      * @param radix is the radix, from 2 to 36 inclusive
1431      * @param minDigits is the minimum number of digits to append.
1432      */
1433     private static <T extends Appendable> void recursiveAppendNumber(T result, int n,
1434             int radix, int minDigits)
1435     {
1436         try {
1437             int digit = n % radix;
1438 
1439             if (n >= radix || minDigits > 1) {
1440                 recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
1441             }
1442             result.append(DIGITS[digit]);
1443         } catch (IOException e) {
1444             throw new IllegalIcuArgumentException(e);
1445         }
1446     }
1447 
1448     /**
1449      * Append a number to the given Appendable in the given radix.
1450      * Standard digits '0'-'9' are used and letters 'A'-'Z' for
1451      * radices 11 through 36.
1452      * @param result the digits of the number are appended here
1453      * @param n the number to be converted to digits; may be negative.
1454      * If negative, a '-' is prepended to the digits.
1455      * @param radix a radix from 2 to 36 inclusive.
1456      * @param minDigits the minimum number of digits, not including
1457      * any '-', to produce.  Values less than 2 have no effect.  One
1458      * digit is always emitted regardless of this parameter.
1459      * @return a reference to result
1460      */
1461     public static <T extends Appendable> T appendNumber(T result, int n,
1462             int radix, int minDigits)
1463     {
1464         try {
1465             if (radix < 2 || radix > 36) {
1466                 throw new IllegalArgumentException("Illegal radix " + radix);
1467             }
1468 
1469 
1470             int abs = n;
1471 
1472             if (n < 0) {
1473                 abs = -n;
1474                 result.append("-");
1475             }
1476 
1477             recursiveAppendNumber(result, abs, radix, minDigits);
1478 
1479             return result;
1480         } catch (IOException e) {
1481             throw new IllegalIcuArgumentException(e);
1482         }
1483 
1484     }
1485 
1486     /**
1487      * Parse an unsigned 31-bit integer at the given offset.  Use
1488      * UCharacter.digit() to parse individual characters into digits.
1489      * @param text the text to be parsed
1490      * @param pos INPUT-OUTPUT parameter.  On entry, pos[0] is the
1491      * offset within text at which to start parsing; it should point
1492      * to a valid digit.  On exit, pos[0] is the offset after the last
1493      * parsed character.  If the parse failed, it will be unchanged on
1494      * exit.  Must be >= 0 on entry.
1495      * @param radix the radix in which to parse; must be >= 2 and <=
1496      * 36.
1497      * @return a non-negative parsed number, or -1 upon parse failure.
1498      * Parse fails if there are no digits, that is, if pos[0] does not
1499      * point to a valid digit on entry, or if the number to be parsed
1500      * does not fit into a 31-bit unsigned integer.
1501      */
1502     public static int parseNumber(String text, int[] pos, int radix) {
1503         // assert(pos[0] >= 0);
1504         // assert(radix >= 2);
1505         // assert(radix <= 36);
1506         int n = 0;
1507         int p = pos[0];
1508         while (p < text.length()) {
1509             int ch = Character.codePointAt(text, p);
1510             int d = UCharacter.digit(ch, radix);
1511             if (d < 0) {
1512                 break;
1513             }
1514             n = radix*n + d;
1515             // ASSUME that when a 32-bit integer overflows it becomes
1516             // negative.  E.g., 214748364 * 10 + 8 => negative value.
1517             if (n < 0) {
1518                 return -1;
1519             }
1520             ++p;
1521         }
1522         if (p == pos[0]) {
1523             return -1;
1524         }
1525         pos[0] = p;
1526         return n;
1527     }
1528 
1529     /**
1530      * Return true if the character is NOT printable ASCII.  The tab,
1531      * newline and linefeed characters are considered unprintable.
1532      */
1533     public static boolean isUnprintable(int c) {
1534         //0x20 = 32 and 0x7E = 126
1535         return !(c >= 0x20 && c <= 0x7E);
1536     }
1537 
1538     /**
1539      * @return true for control codes and for surrogate and noncharacter code points
1540      */
1541     public static boolean shouldAlwaysBeEscaped(int c) {
1542         if (c < 0x20) {
1543             return true;  // C0 control codes
1544         } else if (c <= 0x7e) {
1545             return false;  // printable ASCII
1546         } else if (c <= 0x9f) {
1547             return true;  // C1 control codes
1548         } else if (c < 0xd800) {
1549             return false;  // most of the BMP
1550         } else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
1551             return true;  // surrogate or noncharacter code points
1552         } else if (c <= 0x10ffff) {
1553             return false;  // all else
1554         } else {
1555             return true;  // not a code point
1556         }
1557     }
1558 
1559     /**
1560      * Escapes one unprintable code point using <backslash>uxxxx notation
1561      * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
1562      * above.  If the character is printable ASCII, then do nothing
1563      * and return false.  Otherwise, append the escaped notation and
1564      * return true.
1565      */
1566     public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
1567         if (isUnprintable(c)) {
1568             escape(result, c);
1569             return true;
1570         }
1571         return false;
1572     }
1573 
1574     /**
1575      * Escapes one code point using <backslash>uxxxx notation
1576      * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and above.
1577      * @return result
1578      */
1579     public static <T extends Appendable> T escape(T result, int c) {
1580         try {
1581             result.append('\\');
1582             if ((c & ~0xFFFF) != 0) {
1583                 result.append('U');
1584                 result.append(DIGITS[0xF&(c>>28)]);
1585                 result.append(DIGITS[0xF&(c>>24)]);
1586                 result.append(DIGITS[0xF&(c>>20)]);
1587                 result.append(DIGITS[0xF&(c>>16)]);
1588             } else {
1589                 result.append('u');
1590             }
1591             result.append(DIGITS[0xF&(c>>12)]);
1592             result.append(DIGITS[0xF&(c>>8)]);
1593             result.append(DIGITS[0xF&(c>>4)]);
1594             result.append(DIGITS[0xF&c]);
1595             return result;
1596         } catch (IOException e) {
1597             throw new ICUUncheckedIOException(e);
1598         }
1599     }
1600 
1601     /**
1602      * Returns the index of the first character in a set, ignoring quoted text.
1603      * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
1604      * found by a search for "h".  Unlike String.indexOf(), this method searches
1605      * not for a single character, but for any character of the string
1606      * <code>setOfChars</code>.
1607      * @param text text to be searched
1608      * @param start the beginning index, inclusive; <code>0 <= start
1609      * <= limit</code>.
1610      * @param limit the ending index, exclusive; <code>start <= limit
1611      * <= text.length()</code>.
1612      * @param setOfChars string with one or more distinct characters
1613      * @return Offset of the first character in <code>setOfChars</code>
1614      * found, or -1 if not found.
1615      * @see String#indexOf
1616      */
1617     public static int quotedIndexOf(String text, int start, int limit,
1618             String setOfChars) {
1619         for (int i=start; i<limit; ++i) {
1620             char c = text.charAt(i);
1621             if (c == BACKSLASH) {
1622                 ++i;
1623             } else if (c == APOSTROPHE) {
1624                 while (++i < limit
1625                         && text.charAt(i) != APOSTROPHE) {}
1626             } else if (setOfChars.indexOf(c) >= 0) {
1627                 return i;
1628             }
1629         }
1630         return -1;
1631     }
1632 
1633     /**
1634      * Append a character to a rule that is being built up.  To flush
1635      * the quoteBuf to rule, make one final call with isLiteral == true.
1636      * If there is no final character, pass in (int)-1 as c.
1637      * @param rule the string to append the character to
1638      * @param c the character to append, or (int)-1 if none.
1639      * @param isLiteral if true, then the given character should not be
1640      * quoted or escaped.  Usually this means it is a syntactic element
1641      * such as > or $
1642      * @param escapeUnprintable if true, then unprintable characters
1643      * should be escaped using escapeUnprintable().  These escapes will
1644      * appear outside of quotes.
1645      * @param quoteBuf a buffer which is used to build up quoted
1646      * substrings.  The caller should initially supply an empty buffer,
1647      * and thereafter should not modify the buffer.  The buffer should be
1648      * cleared out by, at the end, calling this method with a literal
1649      * character (which may be -1).
1650      */
1651     public static void appendToRule(StringBuffer rule,
1652             int c,
1653             boolean isLiteral,
1654             boolean escapeUnprintable,
1655             StringBuffer quoteBuf) {
1656         // If we are escaping unprintables, then escape them outside
1657         // quotes.  \\u and \\U are not recognized within quotes.  The same
1658         // logic applies to literals, but literals are never escaped.
1659         if (isLiteral ||
1660                 (escapeUnprintable && Utility.isUnprintable(c))) {
1661             if (quoteBuf.length() > 0) {
1662                 // We prefer backslash APOSTROPHE to double APOSTROPHE
1663                 // (more readable, less similar to ") so if there are
1664                 // double APOSTROPHEs at the ends, we pull them outside
1665                 // of the quote.
1666 
1667                 // If the first thing in the quoteBuf is APOSTROPHE
1668                 // (doubled) then pull it out.
1669                 while (quoteBuf.length() >= 2 &&
1670                         quoteBuf.charAt(0) == APOSTROPHE &&
1671                         quoteBuf.charAt(1) == APOSTROPHE) {
1672                     rule.append(BACKSLASH).append(APOSTROPHE);
1673                     quoteBuf.delete(0, 2);
1674                 }
1675                 // If the last thing in the quoteBuf is APOSTROPHE
1676                 // (doubled) then remove and count it and add it after.
1677                 int trailingCount = 0;
1678                 while (quoteBuf.length() >= 2 &&
1679                         quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
1680                         quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
1681                     quoteBuf.setLength(quoteBuf.length()-2);
1682                     ++trailingCount;
1683                 }
1684                 if (quoteBuf.length() > 0) {
1685                     rule.append(APOSTROPHE);
1686                     rule.append(quoteBuf);
1687                     rule.append(APOSTROPHE);
1688                     quoteBuf.setLength(0);
1689                 }
1690                 while (trailingCount-- > 0) {
1691                     rule.append(BACKSLASH).append(APOSTROPHE);
1692                 }
1693             }
1694             if (c != -1) {
1695                 /* Since spaces are ignored during parsing, they are
1696                  * emitted only for readability.  We emit one here
1697                  * only if there isn't already one at the end of the
1698                  * rule.
1699                  */
1700                 if (c == ' ') {
1701                     int len = rule.length();
1702                     if (len > 0 && rule.charAt(len-1) != ' ') {
1703                         rule.append(' ');
1704                     }
1705                 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
1706                     rule.appendCodePoint(c);
1707                 }
1708             }
1709         }
1710 
1711         // Escape ' and '\' and don't begin a quote just for them
1712         else if (quoteBuf.length() == 0 &&
1713                 (c == APOSTROPHE || c == BACKSLASH)) {
1714             rule.append(BACKSLASH).append((char)c);
1715         }
1716 
1717         // Specials (printable ascii that isn't [0-9a-zA-Z]) and
1718         // whitespace need quoting.  Also append stuff to quotes if we are
1719         // building up a quoted substring already.
1720         else if (quoteBuf.length() > 0 ||
1721                 (c >= 0x0021 && c <= 0x007E &&
1722                         !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
1723                                 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
1724                                 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
1725                                 PatternProps.isWhiteSpace(c)) {
1726             quoteBuf.appendCodePoint(c);
1727             // Double ' within a quote
1728             if (c == APOSTROPHE) {
1729                 quoteBuf.append((char)c);
1730             }
1731         }
1732 
1733         // Otherwise just append
1734         else {
1735             rule.appendCodePoint(c);
1736         }
1737     }
1738 
1739     /**
1740      * Append the given string to the rule.  Calls the single-character
1741      * version of appendToRule for each character.
1742      */
1743     public static void appendToRule(StringBuffer rule,
1744             String text,
1745             boolean isLiteral,
1746             boolean escapeUnprintable,
1747             StringBuffer quoteBuf) {
1748         for (int i=0; i<text.length(); ++i) {
1749             // Okay to process in 16-bit code units here
1750             appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
1751         }
1752     }
1753 
1754     /**
1755      * Given a matcher reference, which may be null, append its
1756      * pattern as a literal to the given rule.
1757      */
1758     public static void appendToRule(StringBuffer rule,
1759             UnicodeMatcher matcher,
1760             boolean escapeUnprintable,
1761             StringBuffer quoteBuf) {
1762         if (matcher != null) {
1763             appendToRule(rule, matcher.toPattern(escapeUnprintable),
1764                     true, escapeUnprintable, quoteBuf);
1765         }
1766     }
1767 
1768     /**
1769      * Compares 2 unsigned integers
1770      * @param source 32 bit unsigned integer
1771      * @param target 32 bit unsigned integer
1772      * @return 0 if equals, 1 if source is greater than target and -1
1773      *         otherwise
1774      */
1775     public static final int compareUnsigned(int source, int target)
1776     {
1777         source += MAGIC_UNSIGNED;
1778         target += MAGIC_UNSIGNED;
1779         if (source < target) {
1780             return -1;
1781         }
1782         else if (source > target) {
1783             return 1;
1784         }
1785         return 0;
1786     }
1787 
1788     /**
1789      * Find the highest bit in a positive integer. This is done
1790      * by doing a binary search through the bits.
1791      *
1792      * @param n is the integer
1793      *
1794      * @return the bit number of the highest bit, with 0 being
1795      * the low order bit, or -1 if <code>n</code> is not positive
1796      */
1797     public static final byte highBit(int n)
1798     {
1799         if (n <= 0) {
1800             return -1;
1801         }
1802 
1803         byte bit = 0;
1804 
1805         if (n >= 1 << 16) {
1806             n >>= 16;
1807         bit += 16;
1808         }
1809 
1810         if (n >= 1 << 8) {
1811             n >>= 8;
1812         bit += 8;
1813         }
1814 
1815         if (n >= 1 << 4) {
1816             n >>= 4;
1817         bit += 4;
1818         }
1819 
1820         if (n >= 1 << 2) {
1821             n >>= 2;
1822         bit += 2;
1823         }
1824 
1825         if (n >= 1 << 1) {
1826             n >>= 1;
1827         bit += 1;
1828         }
1829 
1830         return bit;
1831     }
1832     /**
1833      * Utility method to take a int[] containing codepoints and return
1834      * a string representation with code units.
1835      */
valueOf(int[]source)1836     public static String valueOf(int[]source){
1837         // TODO: Investigate why this method is not on UTF16 class
1838         StringBuilder result = new StringBuilder(source.length);
1839         for(int i=0; i<source.length; i++){
1840             result.appendCodePoint(source[i]);
1841         }
1842         return result.toString();
1843     }
1844 
1845 
1846     /**
1847      * Utility to duplicate a string count times
1848      * @param s String to be duplicated.
1849      * @param count Number of times to duplicate a string.
1850      */
repeat(String s, int count)1851     public static String repeat(String s, int count) {
1852         if (count <= 0) return "";
1853         if (count == 1) return s;
1854         StringBuilder result = new StringBuilder();
1855         for (int i = 0; i < count; ++i) {
1856             result.append(s);
1857         }
1858         return result.toString();
1859     }
1860 
splitString(String src, String target)1861     public static String[] splitString(String src, String target) {
1862         return src.split("\\Q" + target + "\\E");
1863     }
1864 
1865     /**
1866      * Split the string at runs of ascii whitespace characters.
1867      */
splitWhitespace(String src)1868     public static String[] splitWhitespace(String src) {
1869         return src.split("\\s+");
1870     }
1871 
1872     /**
1873      * Parse a list of hex numbers and return a string
1874      * @param string String of hex numbers.
1875      * @param minLength Minimal length.
1876      * @param separator Separator.
1877      * @return A string from hex numbers.
1878      */
fromHex(String string, int minLength, String separator)1879     public static String fromHex(String string, int minLength, String separator) {
1880         return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+"));
1881     }
1882 
1883     /**
1884      * Parse a list of hex numbers and return a string
1885      * @param string String of hex numbers.
1886      * @param minLength Minimal length.
1887      * @param separator Separator.
1888      * @return A string from hex numbers.
1889      */
fromHex(String string, int minLength, Pattern separator)1890     public static String fromHex(String string, int minLength, Pattern separator) {
1891         StringBuilder buffer = new StringBuilder();
1892         String[] parts = separator.split(string);
1893         for (String part : parts) {
1894             if (part.length() < minLength) {
1895                 throw new IllegalArgumentException("code point too short: " + part);
1896             }
1897             int cp = Integer.parseInt(part, 16);
1898             buffer.appendCodePoint(cp);
1899         }
1900         return buffer.toString();
1901     }
1902 
1903     /**
1904      * This implementation is equivalent to Java 8+ Math#addExact(int, int)
1905      * @param x the first value
1906      * @param y the second value
1907      * @return the result
1908      */
addExact(int x, int y)1909     public static int addExact(int x, int y) {
1910         int r = x + y;
1911         // HD 2-12 Overflow iff both arguments have the opposite sign of the result
1912         if (((x ^ r) & (y ^ r)) < 0) {
1913             throw new ArithmeticException("integer overflow");
1914         }
1915         return r;
1916     }
1917 
1918     /**
1919      * Returns whether the chars in the two CharSequences are equal.
1920      */
charSequenceEquals(CharSequence a, CharSequence b)1921     public static boolean charSequenceEquals(CharSequence a, CharSequence b) {
1922         if (a == b) {
1923             return true;
1924         }
1925         if (a == null || b == null) {
1926             return false;
1927         }
1928         if (a.length() != b.length()) {
1929             return false;
1930         }
1931         for (int i = 0; i < a.length(); i++) {
1932             if (a.charAt(i) != b.charAt(i))
1933                 return false;
1934         }
1935         return true;
1936     }
1937 
1938     /**
1939      * Returns a hash code for a CharSequence that is equivalent to calling
1940      * charSequence.toString().hashCode()
1941      */
charSequenceHashCode(CharSequence value)1942     public static int charSequenceHashCode(CharSequence value) {
1943         int hash = 0;
1944         for (int i = 0; i < value.length(); i++) {
1945             hash = hash * 31 + value.charAt(i);
1946         }
1947         return hash;
1948     }
1949 
1950     /**
1951      * Appends a CharSequence to an Appendable, converting IOException to ICUUncheckedIOException.
1952      */
appendTo(CharSequence string, A appendable)1953     public static <A extends Appendable> A appendTo(CharSequence string, A appendable) {
1954         try {
1955             appendable.append(string);
1956             return appendable;
1957         } catch (IOException e) {
1958             throw new ICUUncheckedIOException(e);
1959         }
1960     }
1961 
1962     /**
1963      * Java 8+ String#join(CharSequence, Iterable<? extends CharSequence>) compatible method for Java 7 env.
1964      * @param delimiter the delimiter that separates each element
1965      * @param elements the elements to join together.
1966      * @return a new String that is composed of the elements separated by the delimiter
1967      * @throws NullPointerException If delimiter or elements is null
1968      */
joinStrings(CharSequence delimiter, Iterable<? extends CharSequence> elements)1969     public static String joinStrings(CharSequence delimiter, Iterable<? extends CharSequence> elements) {
1970         if (delimiter == null || elements == null) {
1971             throw new NullPointerException("Delimiter or elements is null");
1972         }
1973         StringBuilder buf = new StringBuilder();
1974         Iterator<? extends CharSequence> itr = elements.iterator();
1975         boolean isFirstElem = true;
1976         while (itr.hasNext()) {
1977             CharSequence element = itr.next();
1978             if (element != null) {
1979                 if (!isFirstElem) {
1980                     buf.append(delimiter);
1981                 } else {
1982                     isFirstElem = false;
1983                 }
1984                 buf.append(element);
1985             }
1986         }
1987         return buf.toString();
1988     }
1989 }
1990