• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 1996-2015, International Business Machines Corporation and    *
6  * others. All Rights Reserved.                                                *
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;
import java.util.regex.Pattern;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeMatcher;
20 
21 public final class Utility {
22 
23     private static final char APOSTROPHE = '\'';
24     private static final char BACKSLASH  = '\\';
25     private static final int MAGIC_UNSIGNED = 0x80000000;
26 
27     /**
28      * Convenience utility to compare two Object[]s.
29      * Ought to be in System
30      */
arrayEquals(Object[] source, Object target)31     public final static boolean arrayEquals(Object[] source, Object target) {
32         if (source == null) return (target == null);
33         if (!(target instanceof Object[])) return false;
34         Object[] targ = (Object[]) target;
35         return (source.length == targ.length
36                 && arrayRegionMatches(source, 0, targ, 0, source.length));
37     }
38 
39     /**
40      * Convenience utility to compare two int[]s
41      * Ought to be in System
42      */
arrayEquals(int[] source, Object target)43     public final static boolean arrayEquals(int[] source, Object target) {
44         if (source == null) return (target == null);
45         if (!(target instanceof int[])) return false;
46         int[] targ = (int[]) target;
47         return (source.length == targ.length
48                 && arrayRegionMatches(source, 0, targ, 0, source.length));
49     }
50 
51     /**
52      * Convenience utility to compare two double[]s
53      * Ought to be in System
54      */
arrayEquals(double[] source, Object target)55     public final static boolean arrayEquals(double[] source, Object target) {
56         if (source == null) return (target == null);
57         if (!(target instanceof double[])) return false;
58         double[] targ = (double[]) target;
59         return (source.length == targ.length
60                 && arrayRegionMatches(source, 0, targ, 0, source.length));
61     }
arrayEquals(byte[] source, Object target)62     public final static boolean arrayEquals(byte[] source, Object target) {
63         if (source == null) return (target == null);
64         if (!(target instanceof byte[])) return false;
65         byte[] targ = (byte[]) target;
66         return (source.length == targ.length
67                 && arrayRegionMatches(source, 0, targ, 0, source.length));
68     }
69 
70     /**
71      * Convenience utility to compare two Object[]s
72      * Ought to be in System
73      */
arrayEquals(Object source, Object target)74     public final static boolean arrayEquals(Object source, Object target) {
75         if (source == null) return (target == null);
76         // for some reason, the correct arrayEquals is not being called
77         // so do it by hand for now.
78         if (source instanceof Object[])
79             return(arrayEquals((Object[]) source,target));
80         if (source instanceof int[])
81             return(arrayEquals((int[]) source,target));
82         if (source instanceof double[])
83             return(arrayEquals((double[]) source, target));
84         if (source instanceof byte[])
85             return(arrayEquals((byte[]) source,target));
86         return source.equals(target);
87     }
88 
89     /**
90      * Convenience utility to compare two Object[]s
91      * Ought to be in System.
92      * @param len the length to compare.
93      * The start indices and start+len must be valid.
94      */
arrayRegionMatches(Object[] source, int sourceStart, Object[] target, int targetStart, int len)95     public final static boolean arrayRegionMatches(Object[] source, int sourceStart,
96             Object[] target, int targetStart,
97             int len)
98     {
99         int sourceEnd = sourceStart + len;
100         int delta = targetStart - sourceStart;
101         for (int i = sourceStart; i < sourceEnd; i++) {
102             if (!arrayEquals(source[i],target[i + delta]))
103                 return false;
104         }
105         return true;
106     }
107 
108     /**
109      * Convenience utility to compare two Object[]s
110      * Ought to be in System.
111      * @param len the length to compare.
112      * The start indices and start+len must be valid.
113      */
arrayRegionMatches(char[] source, int sourceStart, char[] target, int targetStart, int len)114     public final static boolean arrayRegionMatches(char[] source, int sourceStart,
115             char[] target, int targetStart,
116             int len)
117     {
118         int sourceEnd = sourceStart + len;
119         int delta = targetStart - sourceStart;
120         for (int i = sourceStart; i < sourceEnd; i++) {
121             if (source[i]!=target[i + delta])
122                 return false;
123         }
124         return true;
125     }
126 
127     /**
128      * Convenience utility to compare two int[]s.
129      * @param len the length to compare.
130      * The start indices and start+len must be valid.
131      * Ought to be in System
132      */
arrayRegionMatches(int[] source, int sourceStart, int[] target, int targetStart, int len)133     public final static boolean arrayRegionMatches(int[] source, int sourceStart,
134             int[] target, int targetStart,
135             int len)
136     {
137         int sourceEnd = sourceStart + len;
138         int delta = targetStart - sourceStart;
139         for (int i = sourceStart; i < sourceEnd; i++) {
140             if (source[i] != target[i + delta])
141                 return false;
142         }
143         return true;
144     }
145 
146     /**
147      * Convenience utility to compare two arrays of doubles.
148      * @param len the length to compare.
149      * The start indices and start+len must be valid.
150      * Ought to be in System
151      */
arrayRegionMatches(double[] source, int sourceStart, double[] target, int targetStart, int len)152     public final static boolean arrayRegionMatches(double[] source, int sourceStart,
153             double[] target, int targetStart,
154             int len)
155     {
156         int sourceEnd = sourceStart + len;
157         int delta = targetStart - sourceStart;
158         for (int i = sourceStart; i < sourceEnd; i++) {
159             if (source[i] != target[i + delta])
160                 return false;
161         }
162         return true;
163     }
arrayRegionMatches(byte[] source, int sourceStart, byte[] target, int targetStart, int len)164     public final static boolean arrayRegionMatches(byte[] source, int sourceStart,
165             byte[] target, int targetStart, int len){
166         int sourceEnd = sourceStart + len;
167         int delta = targetStart - sourceStart;
168         for (int i = sourceStart; i < sourceEnd; i++) {
169             if (source[i] != target[i + delta])
170                 return false;
171         }
172         return true;
173     }
174 
175     /**
176      * Trivial reference equality.
177      * This method should help document that we really want == not equals(),
178      * and to have a single place to suppress warnings from static analysis tools.
179      */
sameObjects(Object a, Object b)180     public static final boolean sameObjects(Object a, Object b) {
181         return a == b;
182     }
183 
184     /**
185      * Convenience utility. Does null checks on objects, then calls compare.
186      */
checkCompare(T a, T b)187     public static <T extends Comparable<T>> int checkCompare(T a, T b) {
188         return a == null ?
189                 b == null ? 0 : -1 :
190                     b == null ? 1 : a.compareTo(b);
191       }
192 
193     /**
194      * Convenience utility. Does null checks on object, then calls hashCode.
195      */
checkHash(Object a)196     public static int checkHash(Object a) {
197         return a == null ? 0 : a.hashCode();
198       }
199 
200     /**
201      * The ESCAPE character is used during run-length encoding.  It signals
202      * a run of identical chars.
203      */
204     private static final char ESCAPE = '\uA5A5';
205 
206     /**
207      * The ESCAPE_BYTE character is used during run-length encoding.  It signals
208      * a run of identical bytes.
209      */
210     static final byte ESCAPE_BYTE = (byte)0xA5;
211 
212     /**
213      * Construct a string representing an int array.  Use run-length encoding.
214      * A character represents itself, unless it is the ESCAPE character.  Then
215      * the following notations are possible:
216      *   ESCAPE ESCAPE   ESCAPE literal
217      *   ESCAPE n c      n instances of character c
218      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
219      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
220      * If we encounter a run where n == ESCAPE, we represent this as:
221      *   c ESCAPE n-1 c
222      * The ESCAPE value is chosen so as not to collide with commonly
223      * seen values.
224      */
arrayToRLEString(int[] a)225     static public final String arrayToRLEString(int[] a) {
226         StringBuilder buffer = new StringBuilder();
227 
228         appendInt(buffer, a.length);
229         int runValue = a[0];
230         int runLength = 1;
231         for (int i=1; i<a.length; ++i) {
232             int s = a[i];
233             if (s == runValue && runLength < 0xFFFF) {
234                 ++runLength;
235             } else {
236                 encodeRun(buffer, runValue, runLength);
237                 runValue = s;
238                 runLength = 1;
239             }
240         }
241         encodeRun(buffer, runValue, runLength);
242         return buffer.toString();
243     }
244 
245     /**
246      * Construct a string representing a short array.  Use run-length encoding.
247      * A character represents itself, unless it is the ESCAPE character.  Then
248      * the following notations are possible:
249      *   ESCAPE ESCAPE   ESCAPE literal
250      *   ESCAPE n c      n instances of character c
251      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
252      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
253      * If we encounter a run where n == ESCAPE, we represent this as:
254      *   c ESCAPE n-1 c
255      * The ESCAPE value is chosen so as not to collide with commonly
256      * seen values.
257      */
arrayToRLEString(short[] a)258     static public final String arrayToRLEString(short[] a) {
259         StringBuilder buffer = new StringBuilder();
260         // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]);
261         buffer.append((char) (a.length >> 16));
262         buffer.append((char) a.length);
263         short runValue = a[0];
264         int runLength = 1;
265         for (int i=1; i<a.length; ++i) {
266             short s = a[i];
267             if (s == runValue && runLength < 0xFFFF) ++runLength;
268             else {
269                 encodeRun(buffer, runValue, runLength);
270                 runValue = s;
271                 runLength = 1;
272             }
273         }
274         encodeRun(buffer, runValue, runLength);
275         return buffer.toString();
276     }
277 
278     /**
279      * Construct a string representing a char array.  Use run-length encoding.
280      * A character represents itself, unless it is the ESCAPE character.  Then
281      * the following notations are possible:
282      *   ESCAPE ESCAPE   ESCAPE literal
283      *   ESCAPE n c      n instances of character c
284      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
285      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
286      * If we encounter a run where n == ESCAPE, we represent this as:
287      *   c ESCAPE n-1 c
288      * The ESCAPE value is chosen so as not to collide with commonly
289      * seen values.
290      */
arrayToRLEString(char[] a)291     static public final String arrayToRLEString(char[] a) {
292         StringBuilder buffer = new StringBuilder();
293         buffer.append((char) (a.length >> 16));
294         buffer.append((char) a.length);
295         char runValue = a[0];
296         int runLength = 1;
297         for (int i=1; i<a.length; ++i) {
298             char s = a[i];
299             if (s == runValue && runLength < 0xFFFF) ++runLength;
300             else {
301                 encodeRun(buffer, (short)runValue, runLength);
302                 runValue = s;
303                 runLength = 1;
304             }
305         }
306         encodeRun(buffer, (short)runValue, runLength);
307         return buffer.toString();
308     }
309 
310     /**
311      * Construct a string representing a byte array.  Use run-length encoding.
312      * Two bytes are packed into a single char, with a single extra zero byte at
313      * the end if needed.  A byte represents itself, unless it is the
314      * ESCAPE_BYTE.  Then the following notations are possible:
315      *   ESCAPE_BYTE ESCAPE_BYTE   ESCAPE_BYTE literal
316      *   ESCAPE_BYTE n b           n instances of byte b
317      * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
318      * more bytes.  Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
319      * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
320      *   b ESCAPE_BYTE n-1 b
321      * The ESCAPE_BYTE value is chosen so as not to collide with commonly
322      * seen values.
323      */
arrayToRLEString(byte[] a)324     static public final String arrayToRLEString(byte[] a) {
325         StringBuilder buffer = new StringBuilder();
326         buffer.append((char) (a.length >> 16));
327         buffer.append((char) a.length);
328         byte runValue = a[0];
329         int runLength = 1;
330         byte[] state = new byte[2];
331         for (int i=1; i<a.length; ++i) {
332             byte b = a[i];
333             if (b == runValue && runLength < 0xFF) ++runLength;
334             else {
335                 encodeRun(buffer, runValue, runLength, state);
336                 runValue = b;
337                 runLength = 1;
338             }
339         }
340         encodeRun(buffer, runValue, runLength, state);
341 
342         // We must save the final byte, if there is one, by padding
343         // an extra zero.
344         if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state);
345 
346         return buffer.toString();
347     }
348 
349     /**
350      * Encode a run, possibly a degenerate run (of < 4 values).
351      * @param length The length of the run; must be > 0 && <= 0xFFFF.
352      */
encodeRun(T buffer, int value, int length)353     private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) {
354         if (length < 4) {
355             for (int j=0; j<length; ++j) {
356                 if (value == ESCAPE) {
357                     appendInt(buffer, value);
358                 }
359                 appendInt(buffer, value);
360             }
361         }
362         else {
363             if (length == ESCAPE) {
364                 if (value == ESCAPE) {
365                     appendInt(buffer, ESCAPE);
366                 }
367                 appendInt(buffer, value);
368                 --length;
369             }
370             appendInt(buffer, ESCAPE);
371             appendInt(buffer, length);
372             appendInt(buffer, value); // Don't need to escape this value
373         }
374     }
375 
appendInt(T buffer, int value)376     private static final <T extends Appendable> void appendInt(T buffer, int value) {
377         try {
378             buffer.append((char)(value >>> 16));
379             buffer.append((char)(value & 0xFFFF));
380         } catch (IOException e) {
381             throw new IllegalIcuArgumentException(e);
382         }
383     }
384 
385     /**
386      * Encode a run, possibly a degenerate run (of < 4 values).
387      * @param length The length of the run; must be > 0 && <= 0xFFFF.
388      */
encodeRun(T buffer, short value, int length)389     private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) {
390         try {
391             char valueChar = (char) value;
392             if (length < 4) {
393                 for (int j=0; j<length; ++j) {
394                     if (valueChar == ESCAPE) {
395                         buffer.append(ESCAPE);
396                     }
397                     buffer.append(valueChar);
398                 }
399             }
400             else {
401                 if (length == ESCAPE) {
402                     if (valueChar == ESCAPE) {
403                         buffer.append(ESCAPE);
404                     }
405                     buffer.append(valueChar);
406                     --length;
407                 }
408                 buffer.append(ESCAPE);
409                 buffer.append((char) length);
410                 buffer.append(valueChar); // Don't need to escape this value
411             }
412         } catch (IOException e) {
413             throw new IllegalIcuArgumentException(e);
414         }
415     }
416 
417     /**
418      * Encode a run, possibly a degenerate run (of < 4 values).
419      * @param length The length of the run; must be > 0 && <= 0xFF.
420      */
encodeRun(T buffer, byte value, int length, byte[] state)421     private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length,
422             byte[] state) {
423         if (length < 4) {
424             for (int j=0; j<length; ++j) {
425                 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
426                 appendEncodedByte(buffer, value, state);
427             }
428         }
429         else {
430             if ((byte)length == ESCAPE_BYTE) {
431                 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
432                 appendEncodedByte(buffer, value, state);
433                 --length;
434             }
435             appendEncodedByte(buffer, ESCAPE_BYTE, state);
436             appendEncodedByte(buffer, (byte)length, state);
437             appendEncodedByte(buffer, value, state); // Don't need to escape this value
438         }
439     }
440 
441     /**
442      * Append a byte to the given Appendable, packing two bytes into each
443      * character.  The state parameter maintains intermediary data between
444      * calls.
445      * @param state A two-element array, with state[0] == 0 if this is the
446      * first byte of a pair, or state[0] != 0 if this is the second byte
447      * of a pair, in which case state[1] is the first byte.
448      */
appendEncodedByte(T buffer, byte value, byte[] state)449     private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value,
450             byte[] state) {
451         try {
452             if (state[0] != 0) {
453                 char c = (char) ((state[1] << 8) | ((value) & 0xFF));
454                 buffer.append(c);
455                 state[0] = 0;
456             }
457             else {
458                 state[0] = 1;
459                 state[1] = value;
460             }
461         } catch (IOException e) {
462             throw new IllegalIcuArgumentException(e);
463         }
464     }
465 
466     /**
467      * Construct an array of ints from a run-length encoded string.
468      */
RLEStringToIntArray(String s)469     static public final int[] RLEStringToIntArray(String s) {
470         int length = getInt(s, 0);
471         int[] array = new int[length];
472         int ai = 0, i = 1;
473 
474         int maxI = s.length() / 2;
475         while (ai < length && i < maxI) {
476             int c = getInt(s, i++);
477 
478             if (c == ESCAPE) {
479                 c = getInt(s, i++);
480                 if (c == ESCAPE) {
481                     array[ai++] = c;
482                 } else {
483                     int runLength = c;
484                     int runValue = getInt(s, i++);
485                     for (int j=0; j<runLength; ++j) {
486                         array[ai++] = runValue;
487                     }
488                 }
489             }
490             else {
491                 array[ai++] = c;
492             }
493         }
494 
495         if (ai != length || i != maxI) {
496             throw new IllegalStateException("Bad run-length encoded int array");
497         }
498 
499         return array;
500     }
getInt(String s, int i)501     static final int getInt(String s, int i) {
502         return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1);
503     }
504 
505     /**
506      * Construct an array of shorts from a run-length encoded string.
507      */
RLEStringToShortArray(String s)508     static public final short[] RLEStringToShortArray(String s) {
509         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
510         short[] array = new short[length];
511         int ai = 0;
512         for (int i=2; i<s.length(); ++i) {
513             char c = s.charAt(i);
514             if (c == ESCAPE) {
515                 c = s.charAt(++i);
516                 if (c == ESCAPE) {
517                     array[ai++] = (short) c;
518                 } else {
519                     int runLength = c;
520                     short runValue = (short) s.charAt(++i);
521                     for (int j=0; j<runLength; ++j) array[ai++] = runValue;
522                 }
523             }
524             else {
525                 array[ai++] = (short) c;
526             }
527         }
528 
529         if (ai != length)
530             throw new IllegalStateException("Bad run-length encoded short array");
531 
532         return array;
533     }
534 
535     /**
536      * Construct an array of shorts from a run-length encoded string.
537      */
RLEStringToCharArray(String s)538     static public final char[] RLEStringToCharArray(String s) {
539         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
540         char[] array = new char[length];
541         int ai = 0;
542         for (int i=2; i<s.length(); ++i) {
543             char c = s.charAt(i);
544             if (c == ESCAPE) {
545                 c = s.charAt(++i);
546                 if (c == ESCAPE) {
547                     array[ai++] = c;
548                 } else {
549                     int runLength = c;
550                     char runValue = s.charAt(++i);
551                     for (int j=0; j<runLength; ++j) array[ai++] = runValue;
552                 }
553             }
554             else {
555                 array[ai++] = c;
556             }
557         }
558 
559         if (ai != length)
560             throw new IllegalStateException("Bad run-length encoded short array");
561 
562         return array;
563     }
564 
565     /**
566      * Construct an array of bytes from a run-length encoded string.
567      */
RLEStringToByteArray(String s)568     static public final byte[] RLEStringToByteArray(String s) {
569         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
570         byte[] array = new byte[length];
571         boolean nextChar = true;
572         char c = 0;
573         int node = 0;
574         int runLength = 0;
575         int i = 2;
576         for (int ai=0; ai<length; ) {
577             // This part of the loop places the next byte into the local
578             // variable 'b' each time through the loop.  It keeps the
579             // current character in 'c' and uses the boolean 'nextChar'
580             // to see if we've taken both bytes out of 'c' yet.
581             byte b;
582             if (nextChar) {
583                 c = s.charAt(i++);
584                 b = (byte) (c >> 8);
585                 nextChar = false;
586             }
587             else {
588                 b = (byte) (c & 0xFF);
589                 nextChar = true;
590             }
591 
592             // This part of the loop is a tiny state machine which handles
593             // the parsing of the run-length encoding.  This would be simpler
594             // if we could look ahead, but we can't, so we use 'node' to
595             // move between three nodes in the state machine.
596             switch (node) {
597             case 0:
598                 // Normal idle node
599                 if (b == ESCAPE_BYTE) {
600                     node = 1;
601                 }
602                 else {
603                     array[ai++] = b;
604                 }
605                 break;
606             case 1:
607                 // We have seen one ESCAPE_BYTE; we expect either a second
608                 // one, or a run length and value.
609                 if (b == ESCAPE_BYTE) {
610                     array[ai++] = ESCAPE_BYTE;
611                     node = 0;
612                 }
613                 else {
614                     runLength = b;
615                     // Interpret signed byte as unsigned
616                     if (runLength < 0) runLength += 0x100;
617                     node = 2;
618                 }
619                 break;
620             case 2:
621                 // We have seen an ESCAPE_BYTE and length byte.  We interpret
622                 // the next byte as the value to be repeated.
623                 for (int j=0; j<runLength; ++j) array[ai++] = b;
624                 node = 0;
625                 break;
626             }
627         }
628 
629         if (node != 0)
630             throw new IllegalStateException("Bad run-length encoded byte array");
631 
632         if (i != s.length())
633             throw new IllegalStateException("Excess data in RLE byte array string");
634 
635         return array;
636     }
637 
638     static public String LINE_SEPARATOR = System.getProperty("line.separator");
639 
640     /**
641      * Format a String for representation in a source file.  This includes
642      * breaking it into lines and escaping characters using octal notation
643      * when necessary (control characters and double quotes).
644      */
formatForSource(String s)645     static public final String formatForSource(String s) {
646         StringBuilder buffer = new StringBuilder();
647         for (int i=0; i<s.length();) {
648             if (i > 0) buffer.append('+').append(LINE_SEPARATOR);
649             buffer.append("        \"");
650             int count = 11;
651             while (i<s.length() && count<80) {
652                 char c = s.charAt(i++);
653                 if (c < '\u0020' || c == '"' || c == '\\') {
654                     if (c == '\n') {
655                         buffer.append("\\n");
656                         count += 2;
657                     } else if (c == '\t') {
658                         buffer.append("\\t");
659                         count += 2;
660                     } else if (c == '\r') {
661                         buffer.append("\\r");
662                         count += 2;
663                     } else {
664                         // Represent control characters, backslash and double quote
665                         // using octal notation; otherwise the string we form
666                         // won't compile, since Unicode escape sequences are
667                         // processed before tokenization.
668                         buffer.append('\\');
669                         buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
670                         buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
671                         buffer.append(HEX_DIGIT[(c & 0007)]);
672                         count += 4;
673                     }
674                 }
675                 else if (c <= '\u007E') {
676                     buffer.append(c);
677                     count += 1;
678                 }
679                 else {
680                     buffer.append("\\u");
681                     buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
682                     buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
683                     buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
684                     buffer.append(HEX_DIGIT[(c & 0x000F)]);
685                     count += 6;
686                 }
687             }
688             buffer.append('"');
689         }
690         return buffer.toString();
691     }
692 
693     static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7',
694         '8','9','A','B','C','D','E','F'};
695 
696     /**
697      * Format a String for representation in a source file.  Like
698      * formatForSource but does not do line breaking.
699      */
format1ForSource(String s)700     static public final String format1ForSource(String s) {
701         StringBuilder buffer = new StringBuilder();
702         buffer.append("\"");
703         for (int i=0; i<s.length();) {
704             char c = s.charAt(i++);
705             if (c < '\u0020' || c == '"' || c == '\\') {
706                 if (c == '\n') {
707                     buffer.append("\\n");
708                 } else if (c == '\t') {
709                     buffer.append("\\t");
710                 } else if (c == '\r') {
711                     buffer.append("\\r");
712                 } else {
713                     // Represent control characters, backslash and double quote
714                     // using octal notation; otherwise the string we form
715                     // won't compile, since Unicode escape sequences are
716                     // processed before tokenization.
717                     buffer.append('\\');
718                     buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
719                     buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
720                     buffer.append(HEX_DIGIT[(c & 0007)]);
721                 }
722             }
723             else if (c <= '\u007E') {
724                 buffer.append(c);
725             }
726             else {
727                 buffer.append("\\u");
728                 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
729                 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
730                 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
731                 buffer.append(HEX_DIGIT[(c & 0x000F)]);
732             }
733         }
734         buffer.append('"');
735         return buffer.toString();
736     }
737 
738     /**
739      * Convert characters outside the range U+0020 to U+007F to
740      * Unicode escapes, and convert backslash to a double backslash.
741      */
escape(String s)742     public static final String escape(String s) {
743         StringBuilder buf = new StringBuilder();
744         for (int i=0; i<s.length(); ) {
745             int c = Character.codePointAt(s, i);
746             i += UTF16.getCharCount(c);
747             if (c >= ' ' && c <= 0x007F) {
748                 if (c == '\\') {
749                     buf.append("\\\\"); // That is, "\\"
750                 } else {
751                     buf.append((char)c);
752                 }
753             } else {
754                 boolean four = c <= 0xFFFF;
755                 buf.append(four ? "\\u" : "\\U");
756                 buf.append(hex(c, four ? 4 : 8));
757             }
758         }
759         return buf.toString();
760     }
761 
762     /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
763     static private final char[] UNESCAPE_MAP = {
764         /*"   0x22, 0x22 */
765         /*'   0x27, 0x27 */
766         /*?   0x3F, 0x3F */
767         /*\   0x5C, 0x5C */
768         /*a*/ 0x61, 0x07,
769         /*b*/ 0x62, 0x08,
770         /*e*/ 0x65, 0x1b,
771         /*f*/ 0x66, 0x0c,
772         /*n*/ 0x6E, 0x0a,
773         /*r*/ 0x72, 0x0d,
774         /*t*/ 0x74, 0x09,
775         /*v*/ 0x76, 0x0b
776     };
777 
778     /**
779      * Convert an escape to a 32-bit code point value.  We attempt
780      * to parallel the icu4c unescapeAt() function.
781      * @param offset16 an array containing offset to the character
782      * <em>after</em> the backslash.  Upon return offset16[0] will
783      * be updated to point after the escape sequence.
784      * @return character value from 0 to 10FFFF, or -1 on error.
785      */
unescapeAt(String s, int[] offset16)786     public static int unescapeAt(String s, int[] offset16) {
787         int c;
788         int result = 0;
789         int n = 0;
790         int minDig = 0;
791         int maxDig = 0;
792         int bitsPerDigit = 4;
793         int dig;
794         int i;
795         boolean braces = false;
796 
797         /* Check that offset is in range */
798         int offset = offset16[0];
799         int length = s.length();
800         if (offset < 0 || offset >= length) {
801             return -1;
802         }
803 
804         /* Fetch first UChar after '\\' */
805         c = Character.codePointAt(s, offset);
806         offset += UTF16.getCharCount(c);
807 
808         /* Convert hexadecimal and octal escapes */
809         switch (c) {
810         case 'u':
811             minDig = maxDig = 4;
812             break;
813         case 'U':
814             minDig = maxDig = 8;
815             break;
816         case 'x':
817             minDig = 1;
818             if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
819                 ++offset;
820                 braces = true;
821                 maxDig = 8;
822             } else {
823                 maxDig = 2;
824             }
825             break;
826         default:
827             dig = UCharacter.digit(c, 8);
828             if (dig >= 0) {
829                 minDig = 1;
830                 maxDig = 3;
831                 n = 1; /* Already have first octal digit */
832                 bitsPerDigit = 3;
833                 result = dig;
834             }
835             break;
836         }
837         if (minDig != 0) {
838             while (offset < length && n < maxDig) {
839                 c = UTF16.charAt(s, offset);
840                 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
841                 if (dig < 0) {
842                     break;
843                 }
844                 result = (result << bitsPerDigit) | dig;
845                 offset += UTF16.getCharCount(c);
846                 ++n;
847             }
848             if (n < minDig) {
849                 return -1;
850             }
851             if (braces) {
852                 if (c != 0x7D /*}*/) {
853                     return -1;
854                 }
855                 ++offset;
856             }
857             if (result < 0 || result >= 0x110000) {
858                 return -1;
859             }
860             // If an escape sequence specifies a lead surrogate, see
861             // if there is a trail surrogate after it, either as an
862             // escape or as a literal.  If so, join them up into a
863             // supplementary.
864             if (offset < length &&
865                     UTF16.isLeadSurrogate((char) result)) {
866                 int ahead = offset+1;
867                 c = s.charAt(offset); // [sic] get 16-bit code unit
868                 if (c == '\\' && ahead < length) {
869                     int o[] = new int[] { ahead };
870                     c = unescapeAt(s, o);
871                     ahead = o[0];
872                 }
873                 if (UTF16.isTrailSurrogate((char) c)) {
874                     offset = ahead;
875                     result = Character.toCodePoint((char) result, (char) c);
876                 }
877             }
878             offset16[0] = offset;
879             return result;
880         }
881 
882         /* Convert C-style escapes in table */
883         for (i=0; i<UNESCAPE_MAP.length; i+=2) {
884             if (c == UNESCAPE_MAP[i]) {
885                 offset16[0] = offset;
886                 return UNESCAPE_MAP[i+1];
887             } else if (c < UNESCAPE_MAP[i]) {
888                 break;
889             }
890         }
891 
892         /* Map \cX to control-X: X & 0x1F */
893         if (c == 'c' && offset < length) {
894             c = UTF16.charAt(s, offset);
895             offset16[0] = offset + UTF16.getCharCount(c);
896             return 0x1F & c;
897         }
898 
899         /* If no special forms are recognized, then consider
900          * the backslash to generically escape the next character. */
901         offset16[0] = offset;
902         return c;
903     }
904 
905     /**
906      * Convert all escapes in a given string using unescapeAt().
907      * @exception IllegalArgumentException if an invalid escape is
908      * seen.
909      */
unescape(String s)910     public static String unescape(String s) {
911         StringBuilder buf = new StringBuilder();
912         int[] pos = new int[1];
913         for (int i=0; i<s.length(); ) {
914             char c = s.charAt(i++);
915             if (c == '\\') {
916                 pos[0] = i;
917                 int e = unescapeAt(s, pos);
918                 if (e < 0) {
919                     throw new IllegalArgumentException("Invalid escape sequence " +
920                             s.substring(i-1, Math.min(i+8, s.length())));
921                 }
922                 buf.appendCodePoint(e);
923                 i = pos[0];
924             } else {
925                 buf.append(c);
926             }
927         }
928         return buf.toString();
929     }
930 
931     /**
932      * Convert all escapes in a given string using unescapeAt().
933      * Leave invalid escape sequences unchanged.
934      */
unescapeLeniently(String s)935     public static String unescapeLeniently(String s) {
936         StringBuilder buf = new StringBuilder();
937         int[] pos = new int[1];
938         for (int i=0; i<s.length(); ) {
939             char c = s.charAt(i++);
940             if (c == '\\') {
941                 pos[0] = i;
942                 int e = unescapeAt(s, pos);
943                 if (e < 0) {
944                     buf.append(c);
945                 } else {
946                     buf.appendCodePoint(e);
947                     i = pos[0];
948                 }
949             } else {
950                 buf.append(c);
951             }
952         }
953         return buf.toString();
954     }
955 
956     /**
957      * Convert a char to 4 hex uppercase digits.  E.g., hex('a') =>
958      * "0041".
959      */
hex(long ch)960     public static String hex(long ch) {
961         return hex(ch, 4);
962     }
963 
964     /**
965      * Supplies a zero-padded hex representation of an integer (without 0x)
966      */
hex(long i, int places)967     static public String hex(long i, int places) {
968         if (i == Long.MIN_VALUE) return "-8000000000000000";
969         boolean negative = i < 0;
970         if (negative) {
971             i = -i;
972         }
973         String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
974         if (result.length() < places) {
975             result = "0000000000000000".substring(result.length(),places) + result;
976         }
977         if (negative) {
978             return '-' + result;
979         }
980         return result;
981     }
982 
983     /**
984      * Convert a string to comma-separated groups of 4 hex uppercase
985      * digits.  E.g., hex('ab') => "0041,0042".
986      */
987     public static String hex(CharSequence s) {
988         return hex(s, 4, ",", true, new StringBuilder()).toString();
989     }
990 
991     /**
992      * Convert a string to separated groups of hex uppercase
993      * digits.  E.g., hex('ab'...) => "0041,0042".  Append the output
994      * to the given Appendable.
995      */
996     public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
997         try {
998             if (useCodePoints) {
999                 int cp;
1000                 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1001                     cp = Character.codePointAt(s, i);
1002                     if (i != 0) {
1003                         result.append(separator);
1004                     }
1005                     result.append(hex(cp,width));
1006                 }
1007             } else {
1008                 for (int i = 0; i < s.length(); ++i) {
1009                     if (i != 0) {
1010                         result.append(separator);
1011                     }
1012                     result.append(hex(s.charAt(i),width));
1013                 }
1014             }
1015             return result;
1016         } catch (IOException e) {
1017             throw new IllegalIcuArgumentException(e);
1018         }
1019     }
1020 
1021     public static String hex(byte[] o, int start, int end, String separator) {
1022         StringBuilder result = new StringBuilder();
1023         //int ch;
1024         for (int i = start; i < end; ++i) {
1025           if (i != 0) result.append(separator);
1026           result.append(hex(o[i]));
1027         }
1028         return result.toString();
1029       }
1030 
1031     /**
1032      * Convert a string to comma-separated groups of 4 hex uppercase
1033      * digits.  E.g., hex('ab') => "0041,0042".
1034      */
1035     public static <S extends CharSequence> String hex(S s, int width, S separator) {
1036         return hex(s, width, separator, true, new StringBuilder()).toString();
1037     }
1038 
1039     /**
1040      * Split a string into pieces based on the given divider character
1041      * @param s the string to split
1042      * @param divider the character on which to split.  Occurrences of
1043      * this character are not included in the output
1044      * @param output an array to receive the substrings between
1045      * instances of divider.  It must be large enough on entry to
1046      * accomodate all output.  Adjacent instances of the divider
1047      * character will place empty strings into output.  Before
1048      * returning, output is padded out with empty strings.
1049      */
1050     public static void split(String s, char divider, String[] output) {
1051         int last = 0;
1052         int current = 0;
1053         int i;
1054         for (i = 0; i < s.length(); ++i) {
1055             if (s.charAt(i) == divider) {
1056                 output[current++] = s.substring(last,i);
1057                 last = i+1;
1058             }
1059         }
1060         output[current++] = s.substring(last,i);
1061         while (current < output.length) {
1062             output[current++] = "";
1063         }
1064     }
1065 
1066     /**
1067      * Split a string into pieces based on the given divider character
1068      * @param s the string to split
1069      * @param divider the character on which to split.  Occurrences of
1070      * this character are not included in the output
1071      * @return output an array to receive the substrings between
1072      * instances of divider. Adjacent instances of the divider
1073      * character will place empty strings into output.
1074      */
1075     public static String[] split(String s, char divider) {
1076         int last = 0;
1077         int i;
1078         ArrayList<String> output = new ArrayList<>();
1079         for (i = 0; i < s.length(); ++i) {
1080             if (s.charAt(i) == divider) {
1081                 output.add(s.substring(last,i));
1082                 last = i+1;
1083             }
1084         }
1085         output.add( s.substring(last,i));
1086         return output.toArray(new String[output.size()]);
1087     }
1088 
1089     /**
1090      * Look up a given string in a string array.  Returns the index at
1091      * which the first occurrence of the string was found in the
1092      * array, or -1 if it was not found.
1093      * @param source the string to search for
1094      * @param target the array of zero or more strings in which to
1095      * look for source
1096      * @return the index of target at which source first occurs, or -1
1097      * if not found
1098      */
1099     public static int lookup(String source, String[] target) {
1100         for (int i = 0; i < target.length; ++i) {
1101             if (source.equals(target[i])) return i;
1102         }
1103         return -1;
1104     }
1105 
1106     /**
1107      * Parse a single non-whitespace character 'ch', optionally
1108      * preceded by whitespace.
1109      * @param id the string to be parsed
1110      * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
1111      * offset of the first character to be parsed.  On output, pos[0]
1112      * is the index after the last parsed character.  If the parse
1113      * fails, pos[0] will be unchanged.
1114      * @param ch the non-whitespace character to be parsed.
1115      * @return true if 'ch' is seen preceded by zero or more
1116      * whitespace characters.
1117      */
1118     public static boolean parseChar(String id, int[] pos, char ch) {
1119         int start = pos[0];
1120         pos[0] = PatternProps.skipWhiteSpace(id, pos[0]);
1121         if (pos[0] == id.length() ||
1122                 id.charAt(pos[0]) != ch) {
1123             pos[0] = start;
1124             return false;
1125         }
1126         ++pos[0];
1127         return true;
1128     }
1129 
1130     /**
1131      * Parse a pattern string starting at offset pos.  Keywords are
1132      * matched case-insensitively.  Spaces may be skipped and may be
1133      * optional or required.  Integer values may be parsed, and if
1134      * they are, they will be returned in the given array.  If
1135      * successful, the offset of the next non-space character is
1136      * returned.  On failure, -1 is returned.
1137      * @param pattern must only contain lowercase characters, which
1138      * will match their uppercase equivalents as well.  A space
1139      * character matches one or more required spaces.  A '~' character
1140      * matches zero or more optional spaces.  A '#' character matches
1141      * an integer and stores it in parsedInts, which the caller must
1142      * ensure has enough capacity.
1143      * @param parsedInts array to receive parsed integers.  Caller
1144      * must ensure that parsedInts.length is >= the number of '#'
1145      * signs in 'pattern'.
1146      * @return the position after the last character parsed, or -1 if
1147      * the parse failed
1148      */
1149     @SuppressWarnings("fallthrough")
1150     public static int parsePattern(String rule, int pos, int limit,
1151             String pattern, int[] parsedInts) {
1152         // TODO Update this to handle surrogates
1153         int[] p = new int[1];
1154         int intCount = 0; // number of integers parsed
1155         for (int i=0; i<pattern.length(); ++i) {
1156             char cpat = pattern.charAt(i);
1157             char c;
1158             switch (cpat) {
1159             case ' ':
1160                 if (pos >= limit) {
1161                     return -1;
1162                 }
1163                 c = rule.charAt(pos++);
1164                 if (!PatternProps.isWhiteSpace(c)) {
1165                     return -1;
1166                 }
1167                 // FALL THROUGH to skipWhitespace
1168             case '~':
1169                 pos = PatternProps.skipWhiteSpace(rule, pos);
1170                 break;
1171             case '#':
1172                 p[0] = pos;
1173                 parsedInts[intCount++] = parseInteger(rule, p, limit);
1174                 if (p[0] == pos) {
1175                     // Syntax error; failed to parse integer
1176                     return -1;
1177                 }
1178                 pos = p[0];
1179                 break;
1180             default:
1181                 if (pos >= limit) {
1182                     return -1;
1183                 }
1184                 c = (char) UCharacter.toLowerCase(rule.charAt(pos++));
1185                 if (c != cpat) {
1186                     return -1;
1187                 }
1188                 break;
1189             }
1190         }
1191         return pos;
1192     }
1193 
1194     /**
1195      * Parse a pattern string within the given Replaceable and a parsing
1196      * pattern.  Characters are matched literally and case-sensitively
1197      * except for the following special characters:
1198      *
1199      * ~  zero or more Pattern_White_Space chars
1200      *
1201      * If end of pattern is reached with all matches along the way,
1202      * pos is advanced to the first unparsed index and returned.
1203      * Otherwise -1 is returned.
1204      * @param pat pattern that controls parsing
1205      * @param text text to be parsed, starting at index
1206      * @param index offset to first character to parse
1207      * @param limit offset after last character to parse
1208      * @return index after last parsed character, or -1 on parse failure.
1209      */
1210     public static int parsePattern(String pat,
1211             Replaceable text,
1212             int index,
1213             int limit) {
1214         int ipat = 0;
1215 
1216         // empty pattern matches immediately
1217         if (ipat == pat.length()) {
1218             return index;
1219         }
1220 
1221         int cpat = Character.codePointAt(pat, ipat);
1222 
1223         while (index < limit) {
1224             int c = text.char32At(index);
1225 
1226             // parse \s*
1227             if (cpat == '~') {
1228                 if (PatternProps.isWhiteSpace(c)) {
1229                     index += UTF16.getCharCount(c);
1230                     continue;
1231                 } else {
1232                     if (++ipat == pat.length()) {
1233                         return index; // success; c unparsed
1234                     }
1235                     // fall thru; process c again with next cpat
1236                 }
1237             }
1238 
1239             // parse literal
1240             else if (c == cpat) {
1241                 int n = UTF16.getCharCount(c);
1242                 index += n;
1243                 ipat += n;
1244                 if (ipat == pat.length()) {
1245                     return index; // success; c parsed
1246                 }
1247                 // fall thru; get next cpat
1248             }
1249 
1250             // match failure of literal
1251             else {
1252                 return -1;
1253             }
1254 
1255             cpat = UTF16.charAt(pat, ipat);
1256         }
1257 
1258         return -1; // text ended before end of pat
1259     }
1260 
1261     /**
1262      * Parse an integer at pos, either of the form \d+ or of the form
1263      * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
1264      * or octal format.
1265      * @param pos INPUT-OUTPUT parameter.  On input, the first
1266      * character to parse.  On output, the character after the last
1267      * parsed character.
1268      */
1269     public static int parseInteger(String rule, int[] pos, int limit) {
1270         int count = 0;
1271         int value = 0;
1272         int p = pos[0];
1273         int radix = 10;
1274 
1275         if (rule.regionMatches(true, p, "0x", 0, 2)) {
1276             p += 2;
1277             radix = 16;
1278         } else if (p < limit && rule.charAt(p) == '0') {
1279             p++;
1280             count = 1;
1281             radix = 8;
1282         }
1283 
1284         while (p < limit) {
1285             int d = UCharacter.digit(rule.charAt(p++), radix);
1286             if (d < 0) {
1287                 --p;
1288                 break;
1289             }
1290             ++count;
1291             int v = (value * radix) + d;
1292             if (v <= value) {
1293                 // If there are too many input digits, at some point
1294                 // the value will go negative, e.g., if we have seen
1295                 // "0x8000000" already and there is another '0', when
1296                 // we parse the next 0 the value will go negative.
1297                 return 0;
1298             }
1299             value = v;
1300         }
1301         if (count > 0) {
1302             pos[0] = p;
1303         }
1304         return value;
1305     }
1306 
1307     /**
1308      * Parse a Unicode identifier from the given string at the given
1309      * position.  Return the identifier, or null if there is no
1310      * identifier.
1311      * @param str the string to parse
1312      * @param pos INPUT-OUPUT parameter.  On INPUT, pos[0] is the
1313      * first character to examine.  It must be less than str.length(),
1314      * and it must not point to a whitespace character.  That is, must
1315      * have pos[0] < str.length().  On
1316      * OUTPUT, the position after the last parsed character.
1317      * @return the Unicode identifier, or null if there is no valid
1318      * identifier at pos[0].
1319      */
1320     public static String parseUnicodeIdentifier(String str, int[] pos) {
1321         // assert(pos[0] < str.length());
1322         StringBuilder buf = new StringBuilder();
1323         int p = pos[0];
1324         while (p < str.length()) {
1325             int ch = Character.codePointAt(str, p);
1326             if (buf.length() == 0) {
1327                 if (UCharacter.isUnicodeIdentifierStart(ch)) {
1328                     buf.appendCodePoint(ch);
1329                 } else {
1330                     return null;
1331                 }
1332             } else {
1333                 if (UCharacter.isUnicodeIdentifierPart(ch)) {
1334                     buf.appendCodePoint(ch);
1335                 } else {
1336                     break;
1337                 }
1338             }
1339             p += UTF16.getCharCount(ch);
1340         }
1341         pos[0] = p;
1342         return buf.toString();
1343     }
1344 
1345     static final char DIGITS[] = {
1346         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
1347         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
1348         'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
1349         'U', 'V', 'W', 'X', 'Y', 'Z'
1350     };
1351 
1352     /**
1353      * Append the digits of a positive integer to the given
1354      * <code>Appendable</code> in the given radix. This is
1355      * done recursively since it is easiest to generate the low-
1356      * order digit first, but it must be appended last.
1357      *
1358      * @param result is the <code>Appendable</code> to append to
1359      * @param n is the positive integer
1360      * @param radix is the radix, from 2 to 36 inclusive
1361      * @param minDigits is the minimum number of digits to append.
1362      */
1363     private static <T extends Appendable> void recursiveAppendNumber(T result, int n,
1364             int radix, int minDigits)
1365     {
1366         try {
1367             int digit = n % radix;
1368 
1369             if (n >= radix || minDigits > 1) {
1370                 recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
1371             }
1372             result.append(DIGITS[digit]);
1373         } catch (IOException e) {
1374             throw new IllegalIcuArgumentException(e);
1375         }
1376     }
1377 
1378     /**
1379      * Append a number to the given Appendable in the given radix.
1380      * Standard digits '0'-'9' are used and letters 'A'-'Z' for
1381      * radices 11 through 36.
1382      * @param result the digits of the number are appended here
1383      * @param n the number to be converted to digits; may be negative.
1384      * If negative, a '-' is prepended to the digits.
1385      * @param radix a radix from 2 to 36 inclusive.
1386      * @param minDigits the minimum number of digits, not including
1387      * any '-', to produce.  Values less than 2 have no effect.  One
1388      * digit is always emitted regardless of this parameter.
1389      * @return a reference to result
1390      */
1391     public static <T extends Appendable> T appendNumber(T result, int n,
1392             int radix, int minDigits)
1393     {
1394         try {
1395             if (radix < 2 || radix > 36) {
1396                 throw new IllegalArgumentException("Illegal radix " + radix);
1397             }
1398 
1399 
1400             int abs = n;
1401 
1402             if (n < 0) {
1403                 abs = -n;
1404                 result.append("-");
1405             }
1406 
1407             recursiveAppendNumber(result, abs, radix, minDigits);
1408 
1409             return result;
1410         } catch (IOException e) {
1411             throw new IllegalIcuArgumentException(e);
1412         }
1413 
1414     }
1415 
1416     /**
1417      * Parse an unsigned 31-bit integer at the given offset.  Use
1418      * UCharacter.digit() to parse individual characters into digits.
1419      * @param text the text to be parsed
1420      * @param pos INPUT-OUTPUT parameter.  On entry, pos[0] is the
1421      * offset within text at which to start parsing; it should point
1422      * to a valid digit.  On exit, pos[0] is the offset after the last
1423      * parsed character.  If the parse failed, it will be unchanged on
1424      * exit.  Must be >= 0 on entry.
1425      * @param radix the radix in which to parse; must be >= 2 and <=
1426      * 36.
1427      * @return a non-negative parsed number, or -1 upon parse failure.
1428      * Parse fails if there are no digits, that is, if pos[0] does not
1429      * point to a valid digit on entry, or if the number to be parsed
1430      * does not fit into a 31-bit unsigned integer.
1431      */
1432     public static int parseNumber(String text, int[] pos, int radix) {
1433         // assert(pos[0] >= 0);
1434         // assert(radix >= 2);
1435         // assert(radix <= 36);
1436         int n = 0;
1437         int p = pos[0];
1438         while (p < text.length()) {
1439             int ch = Character.codePointAt(text, p);
1440             int d = UCharacter.digit(ch, radix);
1441             if (d < 0) {
1442                 break;
1443             }
1444             n = radix*n + d;
1445             // ASSUME that when a 32-bit integer overflows it becomes
1446             // negative.  E.g., 214748364 * 10 + 8 => negative value.
1447             if (n < 0) {
1448                 return -1;
1449             }
1450             ++p;
1451         }
1452         if (p == pos[0]) {
1453             return -1;
1454         }
1455         pos[0] = p;
1456         return n;
1457     }
1458 
1459     /**
1460      * Return true if the character is NOT printable ASCII.  The tab,
1461      * newline and linefeed characters are considered unprintable.
1462      */
1463     public static boolean isUnprintable(int c) {
1464         //0x20 = 32 and 0x7E = 126
1465         return !(c >= 0x20 && c <= 0x7E);
1466     }
1467 
1468     /**
1469      * Escape unprintable characters using <backslash>uxxxx notation
1470      * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
1471      * above.  If the character is printable ASCII, then do nothing
1472      * and return FALSE.  Otherwise, append the escaped notation and
1473      * return TRUE.
1474      */
1475     public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
1476         try {
1477             if (isUnprintable(c)) {
1478                 result.append('\\');
1479                 if ((c & ~0xFFFF) != 0) {
1480                     result.append('U');
1481                     result.append(DIGITS[0xF&(c>>28)]);
1482                     result.append(DIGITS[0xF&(c>>24)]);
1483                     result.append(DIGITS[0xF&(c>>20)]);
1484                     result.append(DIGITS[0xF&(c>>16)]);
1485                 } else {
1486                     result.append('u');
1487                 }
1488                 result.append(DIGITS[0xF&(c>>12)]);
1489                 result.append(DIGITS[0xF&(c>>8)]);
1490                 result.append(DIGITS[0xF&(c>>4)]);
1491                 result.append(DIGITS[0xF&c]);
1492                 return true;
1493             }
1494             return false;
1495         } catch (IOException e) {
1496             throw new IllegalIcuArgumentException(e);
1497         }
1498     }
1499 
1500     /**
1501      * Returns the index of the first character in a set, ignoring quoted text.
1502      * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
1503      * found by a search for "h".  Unlike String.indexOf(), this method searches
1504      * not for a single character, but for any character of the string
1505      * <code>setOfChars</code>.
1506      * @param text text to be searched
1507      * @param start the beginning index, inclusive; <code>0 <= start
1508      * <= limit</code>.
1509      * @param limit the ending index, exclusive; <code>start <= limit
1510      * <= text.length()</code>.
1511      * @param setOfChars string with one or more distinct characters
1512      * @return Offset of the first character in <code>setOfChars</code>
1513      * found, or -1 if not found.
1514      * @see String#indexOf
1515      */
1516     public static int quotedIndexOf(String text, int start, int limit,
1517             String setOfChars) {
1518         for (int i=start; i<limit; ++i) {
1519             char c = text.charAt(i);
1520             if (c == BACKSLASH) {
1521                 ++i;
1522             } else if (c == APOSTROPHE) {
1523                 while (++i < limit
1524                         && text.charAt(i) != APOSTROPHE) {}
1525             } else if (setOfChars.indexOf(c) >= 0) {
1526                 return i;
1527             }
1528         }
1529         return -1;
1530     }
1531 
1532     /**
1533      * Append a character to a rule that is being built up.  To flush
1534      * the quoteBuf to rule, make one final call with isLiteral == true.
1535      * If there is no final character, pass in (int)-1 as c.
1536      * @param rule the string to append the character to
1537      * @param c the character to append, or (int)-1 if none.
1538      * @param isLiteral if true, then the given character should not be
1539      * quoted or escaped.  Usually this means it is a syntactic element
1540      * such as > or $
1541      * @param escapeUnprintable if true, then unprintable characters
1542      * should be escaped using escapeUnprintable().  These escapes will
1543      * appear outside of quotes.
1544      * @param quoteBuf a buffer which is used to build up quoted
1545      * substrings.  The caller should initially supply an empty buffer,
1546      * and thereafter should not modify the buffer.  The buffer should be
1547      * cleared out by, at the end, calling this method with a literal
1548      * character (which may be -1).
1549      */
1550     public static void appendToRule(StringBuffer rule,
1551             int c,
1552             boolean isLiteral,
1553             boolean escapeUnprintable,
1554             StringBuffer quoteBuf) {
1555         // If we are escaping unprintables, then escape them outside
1556         // quotes.  \\u and \\U are not recognized within quotes.  The same
1557         // logic applies to literals, but literals are never escaped.
1558         if (isLiteral ||
1559                 (escapeUnprintable && Utility.isUnprintable(c))) {
1560             if (quoteBuf.length() > 0) {
1561                 // We prefer backslash APOSTROPHE to double APOSTROPHE
1562                 // (more readable, less similar to ") so if there are
1563                 // double APOSTROPHEs at the ends, we pull them outside
1564                 // of the quote.
1565 
1566                 // If the first thing in the quoteBuf is APOSTROPHE
1567                 // (doubled) then pull it out.
1568                 while (quoteBuf.length() >= 2 &&
1569                         quoteBuf.charAt(0) == APOSTROPHE &&
1570                         quoteBuf.charAt(1) == APOSTROPHE) {
1571                     rule.append(BACKSLASH).append(APOSTROPHE);
1572                     quoteBuf.delete(0, 2);
1573                 }
1574                 // If the last thing in the quoteBuf is APOSTROPHE
1575                 // (doubled) then remove and count it and add it after.
1576                 int trailingCount = 0;
1577                 while (quoteBuf.length() >= 2 &&
1578                         quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
1579                         quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
1580                     quoteBuf.setLength(quoteBuf.length()-2);
1581                     ++trailingCount;
1582                 }
1583                 if (quoteBuf.length() > 0) {
1584                     rule.append(APOSTROPHE);
1585                     rule.append(quoteBuf);
1586                     rule.append(APOSTROPHE);
1587                     quoteBuf.setLength(0);
1588                 }
1589                 while (trailingCount-- > 0) {
1590                     rule.append(BACKSLASH).append(APOSTROPHE);
1591                 }
1592             }
1593             if (c != -1) {
1594                 /* Since spaces are ignored during parsing, they are
1595                  * emitted only for readability.  We emit one here
1596                  * only if there isn't already one at the end of the
1597                  * rule.
1598                  */
1599                 if (c == ' ') {
1600                     int len = rule.length();
1601                     if (len > 0 && rule.charAt(len-1) != ' ') {
1602                         rule.append(' ');
1603                     }
1604                 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
1605                     rule.appendCodePoint(c);
1606                 }
1607             }
1608         }
1609 
1610         // Escape ' and '\' and don't begin a quote just for them
1611         else if (quoteBuf.length() == 0 &&
1612                 (c == APOSTROPHE || c == BACKSLASH)) {
1613             rule.append(BACKSLASH).append((char)c);
1614         }
1615 
1616         // Specials (printable ascii that isn't [0-9a-zA-Z]) and
1617         // whitespace need quoting.  Also append stuff to quotes if we are
1618         // building up a quoted substring already.
1619         else if (quoteBuf.length() > 0 ||
1620                 (c >= 0x0021 && c <= 0x007E &&
1621                         !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
1622                                 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
1623                                 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
1624                                 PatternProps.isWhiteSpace(c)) {
1625             quoteBuf.appendCodePoint(c);
1626             // Double ' within a quote
1627             if (c == APOSTROPHE) {
1628                 quoteBuf.append((char)c);
1629             }
1630         }
1631 
1632         // Otherwise just append
1633         else {
1634             rule.appendCodePoint(c);
1635         }
1636     }
1637 
1638     /**
1639      * Append the given string to the rule.  Calls the single-character
1640      * version of appendToRule for each character.
1641      */
1642     public static void appendToRule(StringBuffer rule,
1643             String text,
1644             boolean isLiteral,
1645             boolean escapeUnprintable,
1646             StringBuffer quoteBuf) {
1647         for (int i=0; i<text.length(); ++i) {
1648             // Okay to process in 16-bit code units here
1649             appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
1650         }
1651     }
1652 
1653     /**
1654      * Given a matcher reference, which may be null, append its
1655      * pattern as a literal to the given rule.
1656      */
1657     public static void appendToRule(StringBuffer rule,
1658             UnicodeMatcher matcher,
1659             boolean escapeUnprintable,
1660             StringBuffer quoteBuf) {
1661         if (matcher != null) {
1662             appendToRule(rule, matcher.toPattern(escapeUnprintable),
1663                     true, escapeUnprintable, quoteBuf);
1664         }
1665     }
1666 
1667     /**
1668      * Compares 2 unsigned integers
1669      * @param source 32 bit unsigned integer
1670      * @param target 32 bit unsigned integer
1671      * @return 0 if equals, 1 if source is greater than target and -1
1672      *         otherwise
1673      */
1674     public static final int compareUnsigned(int source, int target)
1675     {
1676         source += MAGIC_UNSIGNED;
1677         target += MAGIC_UNSIGNED;
1678         if (source < target) {
1679             return -1;
1680         }
1681         else if (source > target) {
1682             return 1;
1683         }
1684         return 0;
1685     }
1686 
1687     /**
1688      * Find the highest bit in a positive integer. This is done
1689      * by doing a binary search through the bits.
1690      *
1691      * @param n is the integer
1692      *
1693      * @return the bit number of the highest bit, with 0 being
1694      * the low order bit, or -1 if <code>n</code> is not positive
1695      */
1696     public static final byte highBit(int n)
1697     {
1698         if (n <= 0) {
1699             return -1;
1700         }
1701 
1702         byte bit = 0;
1703 
1704         if (n >= 1 << 16) {
1705             n >>= 16;
1706         bit += 16;
1707         }
1708 
1709         if (n >= 1 << 8) {
1710             n >>= 8;
1711         bit += 8;
1712         }
1713 
1714         if (n >= 1 << 4) {
1715             n >>= 4;
1716         bit += 4;
1717         }
1718 
1719         if (n >= 1 << 2) {
1720             n >>= 2;
1721         bit += 2;
1722         }
1723 
1724         if (n >= 1 << 1) {
1725             n >>= 1;
1726         bit += 1;
1727         }
1728 
1729         return bit;
1730     }
1731     /**
1732      * Utility method to take a int[] containing codepoints and return
1733      * a string representation with code units.
1734      */
1735     public static String valueOf(int[]source){
1736         // TODO: Investigate why this method is not on UTF16 class
1737         StringBuilder result = new StringBuilder(source.length);
1738         for(int i=0; i<source.length; i++){
1739             result.appendCodePoint(source[i]);
1740         }
1741         return result.toString();
1742     }
1743 
1744 
1745     /**
1746      * Utility to duplicate a string count times
1747      * @param s String to be duplicated.
1748      * @param count Number of times to duplicate a string.
1749      */
1750     public static String repeat(String s, int count) {
1751         if (count <= 0) return "";
1752         if (count == 1) return s;
1753         StringBuilder result = new StringBuilder();
1754         for (int i = 0; i < count; ++i) {
1755             result.append(s);
1756         }
1757         return result.toString();
1758     }
1759 
1760     public static String[] splitString(String src, String target) {
1761         return src.split("\\Q" + target + "\\E");
1762     }
1763 
1764     /**
1765      * Split the string at runs of ascii whitespace characters.
1766      */
1767     public static String[] splitWhitespace(String src) {
1768         return src.split("\\s+");
1769     }
1770 
1771     /**
1772      * Parse a list of hex numbers and return a string
1773      * @param string String of hex numbers.
1774      * @param minLength Minimal length.
1775      * @param separator Separator.
1776      * @return A string from hex numbers.
1777      */
1778     public static String fromHex(String string, int minLength, String separator) {
1779         return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+"));
1780     }
1781 
1782     /**
1783      * Parse a list of hex numbers and return a string
1784      * @param string String of hex numbers.
1785      * @param minLength Minimal length.
1786      * @param separator Separator.
1787      * @return A string from hex numbers.
1788      */
1789     public static String fromHex(String string, int minLength, Pattern separator) {
1790         StringBuilder buffer = new StringBuilder();
1791         String[] parts = separator.split(string);
1792         for (String part : parts) {
1793             if (part.length() < minLength) {
1794                 throw new IllegalArgumentException("code point too short: " + part);
1795             }
1796             int cp = Integer.parseInt(part, 16);
1797             buffer.appendCodePoint(cp);
1798         }
1799         return buffer.toString();
1800     }
1801 
1802     /**
1803      * This implementation is equivalent to Java 8+ Math#addExact(int, int)
1804      * @param x the first value
1805      * @param y the second value
1806      * @return the result
1807      */
1808     public static int addExact(int x, int y) {
1809         int r = x + y;
1810         // HD 2-12 Overflow iff both arguments have the opposite sign of the result
1811         if (((x ^ r) & (y ^ r)) < 0) {
1812             throw new ArithmeticException("integer overflow");
1813         }
1814         return r;
1815     }
1816 
1817     /**
1818      * Returns whether the chars in the two CharSequences are equal.
1819      */
1820     public static boolean charSequenceEquals(CharSequence a, CharSequence b) {
1821         if (a == b) {
1822             return true;
1823         }
1824         if (a == null || b == null) {
1825             return false;
1826         }
1827         if (a.length() != b.length()) {
1828             return false;
1829         }
1830         for (int i = 0; i < a.length(); i++) {
1831             if (a.charAt(i) != b.charAt(i))
1832                 return false;
1833         }
1834         return true;
1835     }
1836 
1837     /**
1838      * Returns a hash code for a CharSequence that is equivalent to calling
1839      * charSequence.toString().hashCode()
1840      */
1841     public static int charSequenceHashCode(CharSequence value) {
1842         int hash = 0;
1843         for (int i = 0; i < value.length(); i++) {
1844             hash = hash * 31 + value.charAt(i);
1845         }
1846         return hash;
1847     }
1848 }
1849