• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 1996-2015, International Business Machines Corporation and    *
6  * others. All Rights Reserved.                                                *
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import java.io.IOException;
12 import java.util.ArrayList;
13 import java.util.Locale;
14 import java.util.regex.Pattern;
15 
16 import com.ibm.icu.lang.UCharacter;
17 import com.ibm.icu.text.Replaceable;
18 import com.ibm.icu.text.UTF16;
19 import com.ibm.icu.text.UnicodeMatcher;
20 import com.ibm.icu.util.ICUUncheckedIOException;
21 
22 public final class Utility {
23 
24     private static final char APOSTROPHE = '\''; // single-quote character
25     private static final char BACKSLASH  = '\\'; // backslash character
26     private static final int MAGIC_UNSIGNED = 0x80000000; // only the top (sign) bit of a 32-bit int is set; NOTE(review): its use is outside this view
27 
28     /**
29      * Convenience utility to compare two Object[]s.
30      * Ought to be in System
31      */
arrayEquals(Object[] source, Object target)32     public final static boolean arrayEquals(Object[] source, Object target) {
33         if (source == null) return (target == null);
34         if (!(target instanceof Object[])) return false;
35         Object[] targ = (Object[]) target;
36         return (source.length == targ.length
37                 && arrayRegionMatches(source, 0, targ, 0, source.length));
38     }
39 
40     /**
41      * Convenience utility to compare two int[]s
42      * Ought to be in System
43      */
arrayEquals(int[] source, Object target)44     public final static boolean arrayEquals(int[] source, Object target) {
45         if (source == null) return (target == null);
46         if (!(target instanceof int[])) return false;
47         int[] targ = (int[]) target;
48         return (source.length == targ.length
49                 && arrayRegionMatches(source, 0, targ, 0, source.length));
50     }
51 
52     /**
53      * Convenience utility to compare two double[]s
54      * Ought to be in System
55      */
arrayEquals(double[] source, Object target)56     public final static boolean arrayEquals(double[] source, Object target) {
57         if (source == null) return (target == null);
58         if (!(target instanceof double[])) return false;
59         double[] targ = (double[]) target;
60         return (source.length == targ.length
61                 && arrayRegionMatches(source, 0, targ, 0, source.length));
62     }
arrayEquals(byte[] source, Object target)63     public final static boolean arrayEquals(byte[] source, Object target) {
64         if (source == null) return (target == null);
65         if (!(target instanceof byte[])) return false;
66         byte[] targ = (byte[]) target;
67         return (source.length == targ.length
68                 && arrayRegionMatches(source, 0, targ, 0, source.length));
69     }
70 
71     /**
72      * Convenience utility to compare two Object[]s
73      * Ought to be in System
74      */
arrayEquals(Object source, Object target)75     public final static boolean arrayEquals(Object source, Object target) {
76         if (source == null) return (target == null);
77         // for some reason, the correct arrayEquals is not being called
78         // so do it by hand for now.
79         if (source instanceof Object[])
80             return(arrayEquals((Object[]) source,target));
81         if (source instanceof int[])
82             return(arrayEquals((int[]) source,target));
83         if (source instanceof double[])
84             return(arrayEquals((double[]) source, target));
85         if (source instanceof byte[])
86             return(arrayEquals((byte[]) source,target));
87         return source.equals(target);
88     }
89 
90     /**
91      * Convenience utility to compare two Object[]s
92      * Ought to be in System.
93      * @param len the length to compare.
94      * The start indices and start+len must be valid.
95      */
arrayRegionMatches(Object[] source, int sourceStart, Object[] target, int targetStart, int len)96     public final static boolean arrayRegionMatches(Object[] source, int sourceStart,
97             Object[] target, int targetStart,
98             int len)
99     {
100         int sourceEnd = sourceStart + len;
101         int delta = targetStart - sourceStart;
102         for (int i = sourceStart; i < sourceEnd; i++) {
103             if (!arrayEquals(source[i],target[i + delta]))
104                 return false;
105         }
106         return true;
107     }
108 
109     /**
110      * Convenience utility to compare two Object[]s
111      * Ought to be in System.
112      * @param len the length to compare.
113      * The start indices and start+len must be valid.
114      */
arrayRegionMatches(char[] source, int sourceStart, char[] target, int targetStart, int len)115     public final static boolean arrayRegionMatches(char[] source, int sourceStart,
116             char[] target, int targetStart,
117             int len)
118     {
119         int sourceEnd = sourceStart + len;
120         int delta = targetStart - sourceStart;
121         for (int i = sourceStart; i < sourceEnd; i++) {
122             if (source[i]!=target[i + delta])
123                 return false;
124         }
125         return true;
126     }
127 
128     /**
129      * Convenience utility to compare two int[]s.
130      * @param len the length to compare.
131      * The start indices and start+len must be valid.
132      * Ought to be in System
133      */
arrayRegionMatches(int[] source, int sourceStart, int[] target, int targetStart, int len)134     public final static boolean arrayRegionMatches(int[] source, int sourceStart,
135             int[] target, int targetStart,
136             int len)
137     {
138         int sourceEnd = sourceStart + len;
139         int delta = targetStart - sourceStart;
140         for (int i = sourceStart; i < sourceEnd; i++) {
141             if (source[i] != target[i + delta])
142                 return false;
143         }
144         return true;
145     }
146 
147     /**
148      * Convenience utility to compare two arrays of doubles.
149      * @param len the length to compare.
150      * The start indices and start+len must be valid.
151      * Ought to be in System
152      */
arrayRegionMatches(double[] source, int sourceStart, double[] target, int targetStart, int len)153     public final static boolean arrayRegionMatches(double[] source, int sourceStart,
154             double[] target, int targetStart,
155             int len)
156     {
157         int sourceEnd = sourceStart + len;
158         int delta = targetStart - sourceStart;
159         for (int i = sourceStart; i < sourceEnd; i++) {
160             if (source[i] != target[i + delta])
161                 return false;
162         }
163         return true;
164     }
arrayRegionMatches(byte[] source, int sourceStart, byte[] target, int targetStart, int len)165     public final static boolean arrayRegionMatches(byte[] source, int sourceStart,
166             byte[] target, int targetStart, int len){
167         int sourceEnd = sourceStart + len;
168         int delta = targetStart - sourceStart;
169         for (int i = sourceStart; i < sourceEnd; i++) {
170             if (source[i] != target[i + delta])
171                 return false;
172         }
173         return true;
174     }
175 
176     /**
177      * Trivial reference equality.
178      * This method should help document that we really want == not equals(),
179      * and to have a single place to suppress warnings from static analysis tools.
180      */
sameObjects(Object a, Object b)181     public static final boolean sameObjects(Object a, Object b) {
182         return a == b;
183     }
184 
185     /**
186      * Convenience utility. Does null checks on objects, then calls compare.
187      */
checkCompare(T a, T b)188     public static <T extends Comparable<T>> int checkCompare(T a, T b) {
189         return a == null ?
190                 b == null ? 0 : -1 :
191                     b == null ? 1 : a.compareTo(b);
192       }
193 
194     /**
195      * Convenience utility. Does null checks on object, then calls hashCode.
196      */
checkHash(Object a)197     public static int checkHash(Object a) {
198         return a == null ? 0 : a.hashCode();
199       }
200 
201     /**
202      * The ESCAPE character is used during run-length encoding of int, short
203      * and char arrays.  ESCAPE ESCAPE stands for one literal ESCAPE value;
204      * ESCAPE followed by a count and a value stands for a run of identical values. */
205     private static final char ESCAPE = '\uA5A5';
206 
207     /**
208      * The ESCAPE_BYTE character is used during run-length encoding of byte
209      * arrays.  It plays the same role as ESCAPE, and signals a run of identical bytes.
210      */
211     static final byte ESCAPE_BYTE = (byte)0xA5;
212 
213     /**
214      * Construct a string representing an int array.  Use run-length encoding.
215      * A character represents itself, unless it is the ESCAPE character.  Then
216      * the following notations are possible:
217      *   ESCAPE ESCAPE   ESCAPE literal
218      *   ESCAPE n c      n instances of character c
219      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
220      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
221      * If we encounter a run where n == ESCAPE, we represent this as:
222      *   c ESCAPE n-1 c
223      * The ESCAPE value is chosen so as not to collide with commonly
224      * seen values.
225      */
arrayToRLEString(int[] a)226     static public final String arrayToRLEString(int[] a) {
227         StringBuilder buffer = new StringBuilder();
228 
229         appendInt(buffer, a.length);
230         int runValue = a[0];
231         int runLength = 1;
232         for (int i=1; i<a.length; ++i) {
233             int s = a[i];
234             if (s == runValue && runLength < 0xFFFF) {
235                 ++runLength;
236             } else {
237                 encodeRun(buffer, runValue, runLength);
238                 runValue = s;
239                 runLength = 1;
240             }
241         }
242         encodeRun(buffer, runValue, runLength);
243         return buffer.toString();
244     }
245 
246     /**
247      * Construct a string representing a short array.  Use run-length encoding.
248      * A character represents itself, unless it is the ESCAPE character.  Then
249      * the following notations are possible:
250      *   ESCAPE ESCAPE   ESCAPE literal
251      *   ESCAPE n c      n instances of character c
252      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
253      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
254      * If we encounter a run where n == ESCAPE, we represent this as:
255      *   c ESCAPE n-1 c
256      * The ESCAPE value is chosen so as not to collide with commonly
257      * seen values.
258      */
arrayToRLEString(short[] a)259     static public final String arrayToRLEString(short[] a) {
260         StringBuilder buffer = new StringBuilder();
261         // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]);
262         buffer.append((char) (a.length >> 16));
263         buffer.append((char) a.length);
264         short runValue = a[0];
265         int runLength = 1;
266         for (int i=1; i<a.length; ++i) {
267             short s = a[i];
268             if (s == runValue && runLength < 0xFFFF) ++runLength;
269             else {
270                 encodeRun(buffer, runValue, runLength);
271                 runValue = s;
272                 runLength = 1;
273             }
274         }
275         encodeRun(buffer, runValue, runLength);
276         return buffer.toString();
277     }
278 
279     /**
280      * Construct a string representing a char array.  Use run-length encoding.
281      * A character represents itself, unless it is the ESCAPE character.  Then
282      * the following notations are possible:
283      *   ESCAPE ESCAPE   ESCAPE literal
284      *   ESCAPE n c      n instances of character c
285      * Since an encoded run occupies 3 characters, we only encode runs of 4 or
286      * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
287      * If we encounter a run where n == ESCAPE, we represent this as:
288      *   c ESCAPE n-1 c
289      * The ESCAPE value is chosen so as not to collide with commonly
290      * seen values.
291      */
arrayToRLEString(char[] a)292     static public final String arrayToRLEString(char[] a) {
293         StringBuilder buffer = new StringBuilder();
294         buffer.append((char) (a.length >> 16));
295         buffer.append((char) a.length);
296         char runValue = a[0];
297         int runLength = 1;
298         for (int i=1; i<a.length; ++i) {
299             char s = a[i];
300             if (s == runValue && runLength < 0xFFFF) ++runLength;
301             else {
302                 encodeRun(buffer, (short)runValue, runLength);
303                 runValue = s;
304                 runLength = 1;
305             }
306         }
307         encodeRun(buffer, (short)runValue, runLength);
308         return buffer.toString();
309     }
310 
311     /**
312      * Construct a string representing a byte array.  Use run-length encoding.
313      * Two bytes are packed into a single char, with a single extra zero byte at
314      * the end if needed.  A byte represents itself, unless it is the
315      * ESCAPE_BYTE.  Then the following notations are possible:
316      *   ESCAPE_BYTE ESCAPE_BYTE   ESCAPE_BYTE literal
317      *   ESCAPE_BYTE n b           n instances of byte b
318      * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
319      * more bytes.  Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
320      * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
321      *   b ESCAPE_BYTE n-1 b
322      * The ESCAPE_BYTE value is chosen so as not to collide with commonly
323      * seen values.
324      */
arrayToRLEString(byte[] a)325     static public final String arrayToRLEString(byte[] a) {
326         StringBuilder buffer = new StringBuilder();
327         buffer.append((char) (a.length >> 16));
328         buffer.append((char) a.length);
329         byte runValue = a[0];
330         int runLength = 1;
331         byte[] state = new byte[2];
332         for (int i=1; i<a.length; ++i) {
333             byte b = a[i];
334             if (b == runValue && runLength < 0xFF) ++runLength;
335             else {
336                 encodeRun(buffer, runValue, runLength, state);
337                 runValue = b;
338                 runLength = 1;
339             }
340         }
341         encodeRun(buffer, runValue, runLength, state);
342 
343         // We must save the final byte, if there is one, by padding
344         // an extra zero.
345         if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state);
346 
347         return buffer.toString();
348     }
349 
350     /**
351      * Encode a run, possibly a degenerate run (of < 4 values).
352      * @param length The length of the run; must be > 0 && <= 0xFFFF.
353      */
encodeRun(T buffer, int value, int length)354     private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) {
355         if (length < 4) {
356             for (int j=0; j<length; ++j) {
357                 if (value == ESCAPE) {
358                     appendInt(buffer, value);
359                 }
360                 appendInt(buffer, value);
361             }
362         }
363         else {
364             if (length == ESCAPE) {
365                 if (value == ESCAPE) {
366                     appendInt(buffer, ESCAPE);
367                 }
368                 appendInt(buffer, value);
369                 --length;
370             }
371             appendInt(buffer, ESCAPE);
372             appendInt(buffer, length);
373             appendInt(buffer, value); // Don't need to escape this value
374         }
375     }
376 
appendInt(T buffer, int value)377     private static final <T extends Appendable> void appendInt(T buffer, int value) {
378         try {
379             buffer.append((char)(value >>> 16));
380             buffer.append((char)(value & 0xFFFF));
381         } catch (IOException e) {
382             throw new IllegalIcuArgumentException(e);
383         }
384     }
385 
386     /**
387      * Encode a run, possibly a degenerate run (of < 4 values).
388      * @param length The length of the run; must be > 0 && <= 0xFFFF.
389      */
encodeRun(T buffer, short value, int length)390     private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) {
391         try {
392             char valueChar = (char) value;
393             if (length < 4) {
394                 for (int j=0; j<length; ++j) {
395                     if (valueChar == ESCAPE) {
396                         buffer.append(ESCAPE);
397                     }
398                     buffer.append(valueChar);
399                 }
400             }
401             else {
402                 if (length == ESCAPE) {
403                     if (valueChar == ESCAPE) {
404                         buffer.append(ESCAPE);
405                     }
406                     buffer.append(valueChar);
407                     --length;
408                 }
409                 buffer.append(ESCAPE);
410                 buffer.append((char) length);
411                 buffer.append(valueChar); // Don't need to escape this value
412             }
413         } catch (IOException e) {
414             throw new IllegalIcuArgumentException(e);
415         }
416     }
417 
418     /**
419      * Encode a run, possibly a degenerate run (of < 4 values).
420      * @param length The length of the run; must be > 0 && <= 0xFF.
421      */
encodeRun(T buffer, byte value, int length, byte[] state)422     private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length,
423             byte[] state) {
424         if (length < 4) {
425             for (int j=0; j<length; ++j) {
426                 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
427                 appendEncodedByte(buffer, value, state);
428             }
429         }
430         else {
431             if ((byte)length == ESCAPE_BYTE) {
432                 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
433                 appendEncodedByte(buffer, value, state);
434                 --length;
435             }
436             appendEncodedByte(buffer, ESCAPE_BYTE, state);
437             appendEncodedByte(buffer, (byte)length, state);
438             appendEncodedByte(buffer, value, state); // Don't need to escape this value
439         }
440     }
441 
442     /**
443      * Append a byte to the given Appendable, packing two bytes into each
444      * character.  The state parameter maintains intermediary data between
445      * calls.
446      * @param state A two-element array, with state[0] == 0 if this is the
447      * first byte of a pair, or state[0] != 0 if this is the second byte
448      * of a pair, in which case state[1] is the first byte.
449      */
appendEncodedByte(T buffer, byte value, byte[] state)450     private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value,
451             byte[] state) {
452         try {
453             if (state[0] != 0) {
454                 char c = (char) ((state[1] << 8) | ((value) & 0xFF));
455                 buffer.append(c);
456                 state[0] = 0;
457             }
458             else {
459                 state[0] = 1;
460                 state[1] = value;
461             }
462         } catch (IOException e) {
463             throw new IllegalIcuArgumentException(e);
464         }
465     }
466 
467     /**
468      * Construct an array of ints from a run-length encoded string.
469      */
RLEStringToIntArray(String s)470     static public final int[] RLEStringToIntArray(String s) {
471         int length = getInt(s, 0);
472         int[] array = new int[length];
473         int ai = 0, i = 1;
474 
475         int maxI = s.length() / 2;
476         while (ai < length && i < maxI) {
477             int c = getInt(s, i++);
478 
479             if (c == ESCAPE) {
480                 c = getInt(s, i++);
481                 if (c == ESCAPE) {
482                     array[ai++] = c;
483                 } else {
484                     int runLength = c;
485                     int runValue = getInt(s, i++);
486                     for (int j=0; j<runLength; ++j) {
487                         array[ai++] = runValue;
488                     }
489                 }
490             }
491             else {
492                 array[ai++] = c;
493             }
494         }
495 
496         if (ai != length || i != maxI) {
497             throw new IllegalStateException("Bad run-length encoded int array");
498         }
499 
500         return array;
501     }
getInt(String s, int i)502     static final int getInt(String s, int i) {
503         return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1);
504     }
505 
506     /**
507      * Construct an array of shorts from a run-length encoded string.
508      */
RLEStringToShortArray(String s)509     static public final short[] RLEStringToShortArray(String s) {
510         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
511         short[] array = new short[length];
512         int ai = 0;
513         for (int i=2; i<s.length(); ++i) {
514             char c = s.charAt(i);
515             if (c == ESCAPE) {
516                 c = s.charAt(++i);
517                 if (c == ESCAPE) {
518                     array[ai++] = (short) c;
519                 } else {
520                     int runLength = c;
521                     short runValue = (short) s.charAt(++i);
522                     for (int j=0; j<runLength; ++j) array[ai++] = runValue;
523                 }
524             }
525             else {
526                 array[ai++] = (short) c;
527             }
528         }
529 
530         if (ai != length)
531             throw new IllegalStateException("Bad run-length encoded short array");
532 
533         return array;
534     }
535 
536     /**
537      * Construct an array of shorts from a run-length encoded string.
538      */
RLEStringToCharArray(String s)539     static public final char[] RLEStringToCharArray(String s) {
540         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
541         char[] array = new char[length];
542         int ai = 0;
543         for (int i=2; i<s.length(); ++i) {
544             char c = s.charAt(i);
545             if (c == ESCAPE) {
546                 c = s.charAt(++i);
547                 if (c == ESCAPE) {
548                     array[ai++] = c;
549                 } else {
550                     int runLength = c;
551                     char runValue = s.charAt(++i);
552                     for (int j=0; j<runLength; ++j) array[ai++] = runValue;
553                 }
554             }
555             else {
556                 array[ai++] = c;
557             }
558         }
559 
560         if (ai != length)
561             throw new IllegalStateException("Bad run-length encoded short array");
562 
563         return array;
564     }
565 
566     /**
567      * Construct an array of bytes from a run-length encoded string.
568      */
RLEStringToByteArray(String s)569     static public final byte[] RLEStringToByteArray(String s) {
570         int length = ((s.charAt(0)) << 16) | (s.charAt(1));
571         byte[] array = new byte[length];
572         boolean nextChar = true;
573         char c = 0;
574         int node = 0;
575         int runLength = 0;
576         int i = 2;
577         for (int ai=0; ai<length; ) {
578             // This part of the loop places the next byte into the local
579             // variable 'b' each time through the loop.  It keeps the
580             // current character in 'c' and uses the boolean 'nextChar'
581             // to see if we've taken both bytes out of 'c' yet.
582             byte b;
583             if (nextChar) {
584                 c = s.charAt(i++);
585                 b = (byte) (c >> 8);
586                 nextChar = false;
587             }
588             else {
589                 b = (byte) (c & 0xFF);
590                 nextChar = true;
591             }
592 
593             // This part of the loop is a tiny state machine which handles
594             // the parsing of the run-length encoding.  This would be simpler
595             // if we could look ahead, but we can't, so we use 'node' to
596             // move between three nodes in the state machine.
597             switch (node) {
598             case 0:
599                 // Normal idle node
600                 if (b == ESCAPE_BYTE) {
601                     node = 1;
602                 }
603                 else {
604                     array[ai++] = b;
605                 }
606                 break;
607             case 1:
608                 // We have seen one ESCAPE_BYTE; we expect either a second
609                 // one, or a run length and value.
610                 if (b == ESCAPE_BYTE) {
611                     array[ai++] = ESCAPE_BYTE;
612                     node = 0;
613                 }
614                 else {
615                     runLength = b;
616                     // Interpret signed byte as unsigned
617                     if (runLength < 0) runLength += 0x100;
618                     node = 2;
619                 }
620                 break;
621             case 2:
622                 // We have seen an ESCAPE_BYTE and length byte.  We interpret
623                 // the next byte as the value to be repeated.
624                 for (int j=0; j<runLength; ++j) array[ai++] = b;
625                 node = 0;
626                 break;
627             }
628         }
629 
630         if (node != 0)
631             throw new IllegalStateException("Bad run-length encoded byte array");
632 
633         if (i != s.length())
634             throw new IllegalStateException("Excess data in RLE byte array string");
635 
636         return array;
637     }
638 
639     static public String LINE_SEPARATOR = System.getProperty("line.separator"); // platform line terminator, read once at class initialization; NOTE(review): public and non-final, so callers can reassign it
640 
641     /**
642      * Format a String for representation in a source file.  This includes
643      * breaking it into lines and escaping characters using octal notation
644      * when necessary (control characters and double quotes).
645      */
formatForSource(String s)646     static public final String formatForSource(String s) {
647         StringBuilder buffer = new StringBuilder();
648         for (int i=0; i<s.length();) {
649             if (i > 0) buffer.append('+').append(LINE_SEPARATOR);
650             buffer.append("        \"");
651             int count = 11;
652             while (i<s.length() && count<80) {
653                 char c = s.charAt(i++);
654                 if (c < '\u0020' || c == '"' || c == '\\') {
655                     if (c == '\n') {
656                         buffer.append("\\n");
657                         count += 2;
658                     } else if (c == '\t') {
659                         buffer.append("\\t");
660                         count += 2;
661                     } else if (c == '\r') {
662                         buffer.append("\\r");
663                         count += 2;
664                     } else {
665                         // Represent control characters, backslash and double quote
666                         // using octal notation; otherwise the string we form
667                         // won't compile, since Unicode escape sequences are
668                         // processed before tokenization.
669                         buffer.append('\\');
670                         buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
671                         buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
672                         buffer.append(HEX_DIGIT[(c & 0007)]);
673                         count += 4;
674                     }
675                 }
676                 else if (c <= '\u007E') {
677                     buffer.append(c);
678                     count += 1;
679                 }
680                 else {
681                     buffer.append("\\u");
682                     buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
683                     buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
684                     buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
685                     buffer.append(HEX_DIGIT[(c & 0x000F)]);
686                     count += 6;
687                 }
688             }
689             buffer.append('"');
690         }
691         return buffer.toString();
692     }
693 
694     static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7', // upper-case hex digits indexed by value; entries 0-7 also serve as octal digits
695         '8','9','A','B','C','D','E','F'};
696 
697     /**
698      * Format a String for representation in a source file.  Like
699      * formatForSource but does not do line breaking.
700      */
format1ForSource(String s)701     static public final String format1ForSource(String s) {
702         StringBuilder buffer = new StringBuilder();
703         buffer.append("\"");
704         for (int i=0; i<s.length();) {
705             char c = s.charAt(i++);
706             if (c < '\u0020' || c == '"' || c == '\\') {
707                 if (c == '\n') {
708                     buffer.append("\\n");
709                 } else if (c == '\t') {
710                     buffer.append("\\t");
711                 } else if (c == '\r') {
712                     buffer.append("\\r");
713                 } else {
714                     // Represent control characters, backslash and double quote
715                     // using octal notation; otherwise the string we form
716                     // won't compile, since Unicode escape sequences are
717                     // processed before tokenization.
718                     buffer.append('\\');
719                     buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
720                     buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
721                     buffer.append(HEX_DIGIT[(c & 0007)]);
722                 }
723             }
724             else if (c <= '\u007E') {
725                 buffer.append(c);
726             }
727             else {
728                 buffer.append("\\u");
729                 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
730                 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
731                 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
732                 buffer.append(HEX_DIGIT[(c & 0x000F)]);
733             }
734         }
735         buffer.append('"');
736         return buffer.toString();
737     }
738 
739     /**
740      * Convert characters outside the range U+0020 to U+007F to
741      * Unicode escapes, and convert backslash to a double backslash.
742      */
escape(String s)743     public static final String escape(String s) {
744         StringBuilder buf = new StringBuilder();
745         for (int i=0; i<s.length(); ) {
746             int c = Character.codePointAt(s, i);
747             i += UTF16.getCharCount(c);
748             if (c >= ' ' && c <= 0x007F) {
749                 if (c == '\\') {
750                     buf.append("\\\\"); // That is, "\\"
751                 } else {
752                     buf.append((char)c);
753                 }
754             } else {
755                 boolean four = c <= 0xFFFF;
756                 buf.append(four ? "\\u" : "\\U");
757                 buf.append(hex(c, four ? 4 : 8));
758             }
759         }
760         return buf.toString();
761     }
762 
763     /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
764     static private final char[] UNESCAPE_MAP = {
765         /*"   0x22, 0x22 */
766         /*'   0x27, 0x27 */
767         /*?   0x3F, 0x3F */
768         /*\   0x5C, 0x5C */
769         /*a*/ 0x61, 0x07,
770         /*b*/ 0x62, 0x08,
771         /*e*/ 0x65, 0x1b,
772         /*f*/ 0x66, 0x0c,
773         /*n*/ 0x6E, 0x0a,
774         /*r*/ 0x72, 0x0d,
775         /*t*/ 0x74, 0x09,
776         /*v*/ 0x76, 0x0b
777     };
778 
779     /**
780      * Convert an escape to a 32-bit code point value.  We attempt
781      * to parallel the icu4c unescapeAt() function.
782      * @param offset16 an array containing offset to the character
783      * <em>after</em> the backslash.  Upon return offset16[0] will
784      * be updated to point after the escape sequence.
785      * @return character value from 0 to 10FFFF, or -1 on error.
786      */
unescapeAt(String s, int[] offset16)787     public static int unescapeAt(String s, int[] offset16) {
788         int c;
789         int result = 0;
790         int n = 0;
791         int minDig = 0;
792         int maxDig = 0;
793         int bitsPerDigit = 4;
794         int dig;
795         int i;
796         boolean braces = false;
797 
798         /* Check that offset is in range */
799         int offset = offset16[0];
800         int length = s.length();
801         if (offset < 0 || offset >= length) {
802             return -1;
803         }
804 
805         /* Fetch first UChar after '\\' */
806         c = Character.codePointAt(s, offset);
807         offset += UTF16.getCharCount(c);
808 
809         /* Convert hexadecimal and octal escapes */
810         switch (c) {
811         case 'u':
812             minDig = maxDig = 4;
813             break;
814         case 'U':
815             minDig = maxDig = 8;
816             break;
817         case 'x':
818             minDig = 1;
819             if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
820                 ++offset;
821                 braces = true;
822                 maxDig = 8;
823             } else {
824                 maxDig = 2;
825             }
826             break;
827         default:
828             dig = UCharacter.digit(c, 8);
829             if (dig >= 0) {
830                 minDig = 1;
831                 maxDig = 3;
832                 n = 1; /* Already have first octal digit */
833                 bitsPerDigit = 3;
834                 result = dig;
835             }
836             break;
837         }
838         if (minDig != 0) {
839             while (offset < length && n < maxDig) {
840                 c = UTF16.charAt(s, offset);
841                 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
842                 if (dig < 0) {
843                     break;
844                 }
845                 result = (result << bitsPerDigit) | dig;
846                 offset += UTF16.getCharCount(c);
847                 ++n;
848             }
849             if (n < minDig) {
850                 return -1;
851             }
852             if (braces) {
853                 if (c != 0x7D /*}*/) {
854                     return -1;
855                 }
856                 ++offset;
857             }
858             if (result < 0 || result >= 0x110000) {
859                 return -1;
860             }
861             // If an escape sequence specifies a lead surrogate, see
862             // if there is a trail surrogate after it, either as an
863             // escape or as a literal.  If so, join them up into a
864             // supplementary.
865             if (offset < length &&
866                     UTF16.isLeadSurrogate((char) result)) {
867                 int ahead = offset+1;
868                 c = s.charAt(offset); // [sic] get 16-bit code unit
869                 if (c == '\\' && ahead < length) {
870                     int o[] = new int[] { ahead };
871                     c = unescapeAt(s, o);
872                     ahead = o[0];
873                 }
874                 if (UTF16.isTrailSurrogate((char) c)) {
875                     offset = ahead;
876                     result = Character.toCodePoint((char) result, (char) c);
877                 }
878             }
879             offset16[0] = offset;
880             return result;
881         }
882 
883         /* Convert C-style escapes in table */
884         for (i=0; i<UNESCAPE_MAP.length; i+=2) {
885             if (c == UNESCAPE_MAP[i]) {
886                 offset16[0] = offset;
887                 return UNESCAPE_MAP[i+1];
888             } else if (c < UNESCAPE_MAP[i]) {
889                 break;
890             }
891         }
892 
893         /* Map \cX to control-X: X & 0x1F */
894         if (c == 'c' && offset < length) {
895             c = UTF16.charAt(s, offset);
896             offset16[0] = offset + UTF16.getCharCount(c);
897             return 0x1F & c;
898         }
899 
900         /* If no special forms are recognized, then consider
901          * the backslash to generically escape the next character. */
902         offset16[0] = offset;
903         return c;
904     }
905 
906     /**
907      * Convert all escapes in a given string using unescapeAt().
908      * @exception IllegalArgumentException if an invalid escape is
909      * seen.
910      */
unescape(String s)911     public static String unescape(String s) {
912         StringBuilder buf = new StringBuilder();
913         int[] pos = new int[1];
914         for (int i=0; i<s.length(); ) {
915             char c = s.charAt(i++);
916             if (c == '\\') {
917                 pos[0] = i;
918                 int e = unescapeAt(s, pos);
919                 if (e < 0) {
920                     throw new IllegalArgumentException("Invalid escape sequence " +
921                             s.substring(i-1, Math.min(i+8, s.length())));
922                 }
923                 buf.appendCodePoint(e);
924                 i = pos[0];
925             } else {
926                 buf.append(c);
927             }
928         }
929         return buf.toString();
930     }
931 
932     /**
933      * Convert all escapes in a given string using unescapeAt().
934      * Leave invalid escape sequences unchanged.
935      */
unescapeLeniently(String s)936     public static String unescapeLeniently(String s) {
937         StringBuilder buf = new StringBuilder();
938         int[] pos = new int[1];
939         for (int i=0; i<s.length(); ) {
940             char c = s.charAt(i++);
941             if (c == '\\') {
942                 pos[0] = i;
943                 int e = unescapeAt(s, pos);
944                 if (e < 0) {
945                     buf.append(c);
946                 } else {
947                     buf.appendCodePoint(e);
948                     i = pos[0];
949                 }
950             } else {
951                 buf.append(c);
952             }
953         }
954         return buf.toString();
955     }
956 
957     /**
958      * Convert a char to 4 hex uppercase digits.  E.g., hex('a') =>
959      * "0041".
960      */
hex(long ch)961     public static String hex(long ch) {
962         return hex(ch, 4);
963     }
964 
965     /**
966      * Supplies a zero-padded hex representation of an integer (without 0x)
967      */
hex(long i, int places)968     static public String hex(long i, int places) {
969         if (i == Long.MIN_VALUE) return "-8000000000000000";
970         boolean negative = i < 0;
971         if (negative) {
972             i = -i;
973         }
974         String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
975         if (result.length() < places) {
976             result = "0000000000000000".substring(result.length(),places) + result;
977         }
978         if (negative) {
979             return '-' + result;
980         }
981         return result;
982     }
983 
984     /**
985      * Convert a string to comma-separated groups of 4 hex uppercase
986      * digits.  E.g., hex('ab') => "0041,0042".
987      */
988     public static String hex(CharSequence s) {
989         return hex(s, 4, ",", true, new StringBuilder()).toString();
990     }
991 
992     /**
993      * Convert a string to separated groups of hex uppercase
994      * digits.  E.g., hex('ab'...) => "0041,0042".  Append the output
995      * to the given Appendable.
996      */
997     public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
998         try {
999             if (useCodePoints) {
1000                 int cp;
1001                 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1002                     cp = Character.codePointAt(s, i);
1003                     if (i != 0) {
1004                         result.append(separator);
1005                     }
1006                     result.append(hex(cp,width));
1007                 }
1008             } else {
1009                 for (int i = 0; i < s.length(); ++i) {
1010                     if (i != 0) {
1011                         result.append(separator);
1012                     }
1013                     result.append(hex(s.charAt(i),width));
1014                 }
1015             }
1016             return result;
1017         } catch (IOException e) {
1018             throw new IllegalIcuArgumentException(e);
1019         }
1020     }
1021 
1022     public static String hex(byte[] o, int start, int end, String separator) {
1023         StringBuilder result = new StringBuilder();
1024         //int ch;
1025         for (int i = start; i < end; ++i) {
1026           if (i != 0) result.append(separator);
1027           result.append(hex(o[i]));
1028         }
1029         return result.toString();
1030       }
1031 
1032     /**
1033      * Convert a string to comma-separated groups of 4 hex uppercase
1034      * digits.  E.g., hex('ab') => "0041,0042".
1035      */
1036     public static <S extends CharSequence> String hex(S s, int width, S separator) {
1037         return hex(s, width, separator, true, new StringBuilder()).toString();
1038     }
1039 
1040     /**
1041      * Split a string into pieces based on the given divider character
1042      * @param s the string to split
1043      * @param divider the character on which to split.  Occurrences of
1044      * this character are not included in the output
1045      * @param output an array to receive the substrings between
1046      * instances of divider.  It must be large enough on entry to
1047      * accomodate all output.  Adjacent instances of the divider
1048      * character will place empty strings into output.  Before
1049      * returning, output is padded out with empty strings.
1050      */
1051     public static void split(String s, char divider, String[] output) {
1052         int last = 0;
1053         int current = 0;
1054         int i;
1055         for (i = 0; i < s.length(); ++i) {
1056             if (s.charAt(i) == divider) {
1057                 output[current++] = s.substring(last,i);
1058                 last = i+1;
1059             }
1060         }
1061         output[current++] = s.substring(last,i);
1062         while (current < output.length) {
1063             output[current++] = "";
1064         }
1065     }
1066 
1067     /**
1068      * Split a string into pieces based on the given divider character
1069      * @param s the string to split
1070      * @param divider the character on which to split.  Occurrences of
1071      * this character are not included in the output
1072      * @return output an array to receive the substrings between
1073      * instances of divider. Adjacent instances of the divider
1074      * character will place empty strings into output.
1075      */
1076     public static String[] split(String s, char divider) {
1077         int last = 0;
1078         int i;
1079         ArrayList<String> output = new ArrayList<>();
1080         for (i = 0; i < s.length(); ++i) {
1081             if (s.charAt(i) == divider) {
1082                 output.add(s.substring(last,i));
1083                 last = i+1;
1084             }
1085         }
1086         output.add( s.substring(last,i));
1087         return output.toArray(new String[output.size()]);
1088     }
1089 
1090     /**
1091      * Look up a given string in a string array.  Returns the index at
1092      * which the first occurrence of the string was found in the
1093      * array, or -1 if it was not found.
1094      * @param source the string to search for
1095      * @param target the array of zero or more strings in which to
1096      * look for source
1097      * @return the index of target at which source first occurs, or -1
1098      * if not found
1099      */
1100     public static int lookup(String source, String[] target) {
1101         for (int i = 0; i < target.length; ++i) {
1102             if (source.equals(target[i])) return i;
1103         }
1104         return -1;
1105     }
1106 
1107     /**
1108      * Parse a single non-whitespace character 'ch', optionally
1109      * preceded by whitespace.
1110      * @param id the string to be parsed
1111      * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
1112      * offset of the first character to be parsed.  On output, pos[0]
1113      * is the index after the last parsed character.  If the parse
1114      * fails, pos[0] will be unchanged.
1115      * @param ch the non-whitespace character to be parsed.
1116      * @return true if 'ch' is seen preceded by zero or more
1117      * whitespace characters.
1118      */
1119     public static boolean parseChar(String id, int[] pos, char ch) {
1120         int start = pos[0];
1121         pos[0] = PatternProps.skipWhiteSpace(id, pos[0]);
1122         if (pos[0] == id.length() ||
1123                 id.charAt(pos[0]) != ch) {
1124             pos[0] = start;
1125             return false;
1126         }
1127         ++pos[0];
1128         return true;
1129     }
1130 
1131     /**
1132      * Parse a pattern string starting at offset pos.  Keywords are
1133      * matched case-insensitively.  Spaces may be skipped and may be
1134      * optional or required.  Integer values may be parsed, and if
1135      * they are, they will be returned in the given array.  If
1136      * successful, the offset of the next non-space character is
1137      * returned.  On failure, -1 is returned.
1138      * @param pattern must only contain lowercase characters, which
1139      * will match their uppercase equivalents as well.  A space
1140      * character matches one or more required spaces.  A '~' character
1141      * matches zero or more optional spaces.  A '#' character matches
1142      * an integer and stores it in parsedInts, which the caller must
1143      * ensure has enough capacity.
1144      * @param parsedInts array to receive parsed integers.  Caller
1145      * must ensure that parsedInts.length is >= the number of '#'
1146      * signs in 'pattern'.
1147      * @return the position after the last character parsed, or -1 if
1148      * the parse failed
1149      */
1150     @SuppressWarnings("fallthrough")
1151     public static int parsePattern(String rule, int pos, int limit,
1152             String pattern, int[] parsedInts) {
1153         // TODO Update this to handle surrogates
1154         int[] p = new int[1];
1155         int intCount = 0; // number of integers parsed
1156         for (int i=0; i<pattern.length(); ++i) {
1157             char cpat = pattern.charAt(i);
1158             char c;
1159             switch (cpat) {
1160             case ' ':
1161                 if (pos >= limit) {
1162                     return -1;
1163                 }
1164                 c = rule.charAt(pos++);
1165                 if (!PatternProps.isWhiteSpace(c)) {
1166                     return -1;
1167                 }
1168                 // FALL THROUGH to skipWhitespace
1169             case '~':
1170                 pos = PatternProps.skipWhiteSpace(rule, pos);
1171                 break;
1172             case '#':
1173                 p[0] = pos;
1174                 parsedInts[intCount++] = parseInteger(rule, p, limit);
1175                 if (p[0] == pos) {
1176                     // Syntax error; failed to parse integer
1177                     return -1;
1178                 }
1179                 pos = p[0];
1180                 break;
1181             default:
1182                 if (pos >= limit) {
1183                     return -1;
1184                 }
1185                 c = (char) UCharacter.toLowerCase(rule.charAt(pos++));
1186                 if (c != cpat) {
1187                     return -1;
1188                 }
1189                 break;
1190             }
1191         }
1192         return pos;
1193     }
1194 
1195     /**
1196      * Parse a pattern string within the given Replaceable and a parsing
1197      * pattern.  Characters are matched literally and case-sensitively
1198      * except for the following special characters:
1199      *
1200      * ~  zero or more Pattern_White_Space chars
1201      *
1202      * If end of pattern is reached with all matches along the way,
1203      * pos is advanced to the first unparsed index and returned.
1204      * Otherwise -1 is returned.
1205      * @param pat pattern that controls parsing
1206      * @param text text to be parsed, starting at index
1207      * @param index offset to first character to parse
1208      * @param limit offset after last character to parse
1209      * @return index after last parsed character, or -1 on parse failure.
1210      */
1211     public static int parsePattern(String pat,
1212             Replaceable text,
1213             int index,
1214             int limit) {
1215         int ipat = 0;
1216 
1217         // empty pattern matches immediately
1218         if (ipat == pat.length()) {
1219             return index;
1220         }
1221 
1222         int cpat = Character.codePointAt(pat, ipat);
1223 
1224         while (index < limit) {
1225             int c = text.char32At(index);
1226 
1227             // parse \s*
1228             if (cpat == '~') {
1229                 if (PatternProps.isWhiteSpace(c)) {
1230                     index += UTF16.getCharCount(c);
1231                     continue;
1232                 } else {
1233                     if (++ipat == pat.length()) {
1234                         return index; // success; c unparsed
1235                     }
1236                     // fall thru; process c again with next cpat
1237                 }
1238             }
1239 
1240             // parse literal
1241             else if (c == cpat) {
1242                 int n = UTF16.getCharCount(c);
1243                 index += n;
1244                 ipat += n;
1245                 if (ipat == pat.length()) {
1246                     return index; // success; c parsed
1247                 }
1248                 // fall thru; get next cpat
1249             }
1250 
1251             // match failure of literal
1252             else {
1253                 return -1;
1254             }
1255 
1256             cpat = UTF16.charAt(pat, ipat);
1257         }
1258 
1259         return -1; // text ended before end of pat
1260     }
1261 
1262     /**
1263      * Parse an integer at pos, either of the form \d+ or of the form
1264      * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
1265      * or octal format.
1266      * @param pos INPUT-OUTPUT parameter.  On input, the first
1267      * character to parse.  On output, the character after the last
1268      * parsed character.
1269      */
1270     public static int parseInteger(String rule, int[] pos, int limit) {
1271         int count = 0;
1272         int value = 0;
1273         int p = pos[0];
1274         int radix = 10;
1275 
1276         if (rule.regionMatches(true, p, "0x", 0, 2)) {
1277             p += 2;
1278             radix = 16;
1279         } else if (p < limit && rule.charAt(p) == '0') {
1280             p++;
1281             count = 1;
1282             radix = 8;
1283         }
1284 
1285         while (p < limit) {
1286             int d = UCharacter.digit(rule.charAt(p++), radix);
1287             if (d < 0) {
1288                 --p;
1289                 break;
1290             }
1291             ++count;
1292             int v = (value * radix) + d;
1293             if (v <= value) {
1294                 // If there are too many input digits, at some point
1295                 // the value will go negative, e.g., if we have seen
1296                 // "0x8000000" already and there is another '0', when
1297                 // we parse the next 0 the value will go negative.
1298                 return 0;
1299             }
1300             value = v;
1301         }
1302         if (count > 0) {
1303             pos[0] = p;
1304         }
1305         return value;
1306     }
1307 
1308     /**
1309      * Parse a Unicode identifier from the given string at the given
1310      * position.  Return the identifier, or null if there is no
1311      * identifier.
1312      * @param str the string to parse
1313      * @param pos INPUT-OUPUT parameter.  On INPUT, pos[0] is the
1314      * first character to examine.  It must be less than str.length(),
1315      * and it must not point to a whitespace character.  That is, must
1316      * have pos[0] < str.length().  On
1317      * OUTPUT, the position after the last parsed character.
1318      * @return the Unicode identifier, or null if there is no valid
1319      * identifier at pos[0].
1320      */
1321     public static String parseUnicodeIdentifier(String str, int[] pos) {
1322         // assert(pos[0] < str.length());
1323         StringBuilder buf = new StringBuilder();
1324         int p = pos[0];
1325         while (p < str.length()) {
1326             int ch = Character.codePointAt(str, p);
1327             if (buf.length() == 0) {
1328                 if (UCharacter.isUnicodeIdentifierStart(ch)) {
1329                     buf.appendCodePoint(ch);
1330                 } else {
1331                     return null;
1332                 }
1333             } else {
1334                 if (UCharacter.isUnicodeIdentifierPart(ch)) {
1335                     buf.appendCodePoint(ch);
1336                 } else {
1337                     break;
1338                 }
1339             }
1340             p += UTF16.getCharCount(ch);
1341         }
1342         pos[0] = p;
1343         return buf.toString();
1344     }
1345 
1346     static final char DIGITS[] = {
1347         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
1348         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
1349         'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
1350         'U', 'V', 'W', 'X', 'Y', 'Z'
1351     };
1352 
1353     /**
1354      * Append the digits of a positive integer to the given
1355      * <code>Appendable</code> in the given radix. This is
1356      * done recursively since it is easiest to generate the low-
1357      * order digit first, but it must be appended last.
1358      *
1359      * @param result is the <code>Appendable</code> to append to
1360      * @param n is the positive integer
1361      * @param radix is the radix, from 2 to 36 inclusive
1362      * @param minDigits is the minimum number of digits to append.
1363      */
1364     private static <T extends Appendable> void recursiveAppendNumber(T result, int n,
1365             int radix, int minDigits)
1366     {
1367         try {
1368             int digit = n % radix;
1369 
1370             if (n >= radix || minDigits > 1) {
1371                 recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
1372             }
1373             result.append(DIGITS[digit]);
1374         } catch (IOException e) {
1375             throw new IllegalIcuArgumentException(e);
1376         }
1377     }
1378 
1379     /**
1380      * Append a number to the given Appendable in the given radix.
1381      * Standard digits '0'-'9' are used and letters 'A'-'Z' for
1382      * radices 11 through 36.
1383      * @param result the digits of the number are appended here
1384      * @param n the number to be converted to digits; may be negative.
1385      * If negative, a '-' is prepended to the digits.
1386      * @param radix a radix from 2 to 36 inclusive.
1387      * @param minDigits the minimum number of digits, not including
1388      * any '-', to produce.  Values less than 2 have no effect.  One
1389      * digit is always emitted regardless of this parameter.
1390      * @return a reference to result
1391      */
1392     public static <T extends Appendable> T appendNumber(T result, int n,
1393             int radix, int minDigits)
1394     {
1395         try {
1396             if (radix < 2 || radix > 36) {
1397                 throw new IllegalArgumentException("Illegal radix " + radix);
1398             }
1399 
1400 
1401             int abs = n;
1402 
1403             if (n < 0) {
1404                 abs = -n;
1405                 result.append("-");
1406             }
1407 
1408             recursiveAppendNumber(result, abs, radix, minDigits);
1409 
1410             return result;
1411         } catch (IOException e) {
1412             throw new IllegalIcuArgumentException(e);
1413         }
1414 
1415     }
1416 
1417     /**
1418      * Parse an unsigned 31-bit integer at the given offset.  Use
1419      * UCharacter.digit() to parse individual characters into digits.
1420      * @param text the text to be parsed
1421      * @param pos INPUT-OUTPUT parameter.  On entry, pos[0] is the
1422      * offset within text at which to start parsing; it should point
1423      * to a valid digit.  On exit, pos[0] is the offset after the last
1424      * parsed character.  If the parse failed, it will be unchanged on
1425      * exit.  Must be >= 0 on entry.
1426      * @param radix the radix in which to parse; must be >= 2 and <=
1427      * 36.
1428      * @return a non-negative parsed number, or -1 upon parse failure.
1429      * Parse fails if there are no digits, that is, if pos[0] does not
1430      * point to a valid digit on entry, or if the number to be parsed
1431      * does not fit into a 31-bit unsigned integer.
1432      */
1433     public static int parseNumber(String text, int[] pos, int radix) {
1434         // assert(pos[0] >= 0);
1435         // assert(radix >= 2);
1436         // assert(radix <= 36);
1437         int n = 0;
1438         int p = pos[0];
1439         while (p < text.length()) {
1440             int ch = Character.codePointAt(text, p);
1441             int d = UCharacter.digit(ch, radix);
1442             if (d < 0) {
1443                 break;
1444             }
1445             n = radix*n + d;
1446             // ASSUME that when a 32-bit integer overflows it becomes
1447             // negative.  E.g., 214748364 * 10 + 8 => negative value.
1448             if (n < 0) {
1449                 return -1;
1450             }
1451             ++p;
1452         }
1453         if (p == pos[0]) {
1454             return -1;
1455         }
1456         pos[0] = p;
1457         return n;
1458     }
1459 
1460     /**
1461      * Return true if the character is NOT printable ASCII.  The tab,
1462      * newline and linefeed characters are considered unprintable.
1463      */
1464     public static boolean isUnprintable(int c) {
1465         //0x20 = 32 and 0x7E = 126
1466         return !(c >= 0x20 && c <= 0x7E);
1467     }
1468 
1469     /**
1470      * Escape unprintable characters using <backslash>uxxxx notation
1471      * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
1472      * above.  If the character is printable ASCII, then do nothing
1473      * and return FALSE.  Otherwise, append the escaped notation and
1474      * return TRUE.
1475      */
1476     public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
1477         try {
1478             if (isUnprintable(c)) {
1479                 result.append('\\');
1480                 if ((c & ~0xFFFF) != 0) {
1481                     result.append('U');
1482                     result.append(DIGITS[0xF&(c>>28)]);
1483                     result.append(DIGITS[0xF&(c>>24)]);
1484                     result.append(DIGITS[0xF&(c>>20)]);
1485                     result.append(DIGITS[0xF&(c>>16)]);
1486                 } else {
1487                     result.append('u');
1488                 }
1489                 result.append(DIGITS[0xF&(c>>12)]);
1490                 result.append(DIGITS[0xF&(c>>8)]);
1491                 result.append(DIGITS[0xF&(c>>4)]);
1492                 result.append(DIGITS[0xF&c]);
1493                 return true;
1494             }
1495             return false;
1496         } catch (IOException e) {
1497             throw new IllegalIcuArgumentException(e);
1498         }
1499     }
1500 
1501     /**
1502      * Returns the index of the first character in a set, ignoring quoted text.
1503      * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
1504      * found by a search for "h".  Unlike String.indexOf(), this method searches
1505      * not for a single character, but for any character of the string
1506      * <code>setOfChars</code>.
1507      * @param text text to be searched
1508      * @param start the beginning index, inclusive; <code>0 <= start
1509      * <= limit</code>.
1510      * @param limit the ending index, exclusive; <code>start <= limit
1511      * <= text.length()</code>.
1512      * @param setOfChars string with one or more distinct characters
1513      * @return Offset of the first character in <code>setOfChars</code>
1514      * found, or -1 if not found.
1515      * @see String#indexOf
1516      */
1517     public static int quotedIndexOf(String text, int start, int limit,
1518             String setOfChars) {
1519         for (int i=start; i<limit; ++i) {
1520             char c = text.charAt(i);
1521             if (c == BACKSLASH) {
1522                 ++i;
1523             } else if (c == APOSTROPHE) {
1524                 while (++i < limit
1525                         && text.charAt(i) != APOSTROPHE) {}
1526             } else if (setOfChars.indexOf(c) >= 0) {
1527                 return i;
1528             }
1529         }
1530         return -1;
1531     }
1532 
1533     /**
1534      * Append a character to a rule that is being built up.  To flush
1535      * the quoteBuf to rule, make one final call with isLiteral == true.
1536      * If there is no final character, pass in (int)-1 as c.
1537      * @param rule the string to append the character to
1538      * @param c the character to append, or (int)-1 if none.
1539      * @param isLiteral if true, then the given character should not be
1540      * quoted or escaped.  Usually this means it is a syntactic element
1541      * such as > or $
1542      * @param escapeUnprintable if true, then unprintable characters
1543      * should be escaped using escapeUnprintable().  These escapes will
1544      * appear outside of quotes.
1545      * @param quoteBuf a buffer which is used to build up quoted
1546      * substrings.  The caller should initially supply an empty buffer,
1547      * and thereafter should not modify the buffer.  The buffer should be
1548      * cleared out by, at the end, calling this method with a literal
1549      * character (which may be -1).
1550      */
1551     public static void appendToRule(StringBuffer rule,
1552             int c,
1553             boolean isLiteral,
1554             boolean escapeUnprintable,
1555             StringBuffer quoteBuf) {
1556         // If we are escaping unprintables, then escape them outside
1557         // quotes.  \\u and \\U are not recognized within quotes.  The same
1558         // logic applies to literals, but literals are never escaped.
1559         if (isLiteral ||
1560                 (escapeUnprintable && Utility.isUnprintable(c))) {
1561             if (quoteBuf.length() > 0) {
1562                 // We prefer backslash APOSTROPHE to double APOSTROPHE
1563                 // (more readable, less similar to ") so if there are
1564                 // double APOSTROPHEs at the ends, we pull them outside
1565                 // of the quote.
1566 
1567                 // If the first thing in the quoteBuf is APOSTROPHE
1568                 // (doubled) then pull it out.
1569                 while (quoteBuf.length() >= 2 &&
1570                         quoteBuf.charAt(0) == APOSTROPHE &&
1571                         quoteBuf.charAt(1) == APOSTROPHE) {
1572                     rule.append(BACKSLASH).append(APOSTROPHE);
1573                     quoteBuf.delete(0, 2);
1574                 }
1575                 // If the last thing in the quoteBuf is APOSTROPHE
1576                 // (doubled) then remove and count it and add it after.
1577                 int trailingCount = 0;
1578                 while (quoteBuf.length() >= 2 &&
1579                         quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
1580                         quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
1581                     quoteBuf.setLength(quoteBuf.length()-2);
1582                     ++trailingCount;
1583                 }
1584                 if (quoteBuf.length() > 0) {
1585                     rule.append(APOSTROPHE);
1586                     rule.append(quoteBuf);
1587                     rule.append(APOSTROPHE);
1588                     quoteBuf.setLength(0);
1589                 }
1590                 while (trailingCount-- > 0) {
1591                     rule.append(BACKSLASH).append(APOSTROPHE);
1592                 }
1593             }
1594             if (c != -1) {
1595                 /* Since spaces are ignored during parsing, they are
1596                  * emitted only for readability.  We emit one here
1597                  * only if there isn't already one at the end of the
1598                  * rule.
1599                  */
1600                 if (c == ' ') {
1601                     int len = rule.length();
1602                     if (len > 0 && rule.charAt(len-1) != ' ') {
1603                         rule.append(' ');
1604                     }
1605                 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
1606                     rule.appendCodePoint(c);
1607                 }
1608             }
1609         }
1610 
1611         // Escape ' and '\' and don't begin a quote just for them
1612         else if (quoteBuf.length() == 0 &&
1613                 (c == APOSTROPHE || c == BACKSLASH)) {
1614             rule.append(BACKSLASH).append((char)c);
1615         }
1616 
1617         // Specials (printable ascii that isn't [0-9a-zA-Z]) and
1618         // whitespace need quoting.  Also append stuff to quotes if we are
1619         // building up a quoted substring already.
1620         else if (quoteBuf.length() > 0 ||
1621                 (c >= 0x0021 && c <= 0x007E &&
1622                         !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
1623                                 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
1624                                 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
1625                                 PatternProps.isWhiteSpace(c)) {
1626             quoteBuf.appendCodePoint(c);
1627             // Double ' within a quote
1628             if (c == APOSTROPHE) {
1629                 quoteBuf.append((char)c);
1630             }
1631         }
1632 
1633         // Otherwise just append
1634         else {
1635             rule.appendCodePoint(c);
1636         }
1637     }
1638 
1639     /**
1640      * Append the given string to the rule.  Calls the single-character
1641      * version of appendToRule for each character.
1642      */
1643     public static void appendToRule(StringBuffer rule,
1644             String text,
1645             boolean isLiteral,
1646             boolean escapeUnprintable,
1647             StringBuffer quoteBuf) {
1648         for (int i=0; i<text.length(); ++i) {
1649             // Okay to process in 16-bit code units here
1650             appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
1651         }
1652     }
1653 
1654     /**
1655      * Given a matcher reference, which may be null, append its
1656      * pattern as a literal to the given rule.
1657      */
1658     public static void appendToRule(StringBuffer rule,
1659             UnicodeMatcher matcher,
1660             boolean escapeUnprintable,
1661             StringBuffer quoteBuf) {
1662         if (matcher != null) {
1663             appendToRule(rule, matcher.toPattern(escapeUnprintable),
1664                     true, escapeUnprintable, quoteBuf);
1665         }
1666     }
1667 
1668     /**
1669      * Compares 2 unsigned integers
1670      * @param source 32 bit unsigned integer
1671      * @param target 32 bit unsigned integer
1672      * @return 0 if equals, 1 if source is greater than target and -1
1673      *         otherwise
1674      */
1675     public static final int compareUnsigned(int source, int target)
1676     {
1677         source += MAGIC_UNSIGNED;
1678         target += MAGIC_UNSIGNED;
1679         if (source < target) {
1680             return -1;
1681         }
1682         else if (source > target) {
1683             return 1;
1684         }
1685         return 0;
1686     }
1687 
1688     /**
1689      * Find the highest bit in a positive integer. This is done
1690      * by doing a binary search through the bits.
1691      *
1692      * @param n is the integer
1693      *
1694      * @return the bit number of the highest bit, with 0 being
1695      * the low order bit, or -1 if <code>n</code> is not positive
1696      */
1697     public static final byte highBit(int n)
1698     {
1699         if (n <= 0) {
1700             return -1;
1701         }
1702 
1703         byte bit = 0;
1704 
1705         if (n >= 1 << 16) {
1706             n >>= 16;
1707         bit += 16;
1708         }
1709 
1710         if (n >= 1 << 8) {
1711             n >>= 8;
1712         bit += 8;
1713         }
1714 
1715         if (n >= 1 << 4) {
1716             n >>= 4;
1717         bit += 4;
1718         }
1719 
1720         if (n >= 1 << 2) {
1721             n >>= 2;
1722         bit += 2;
1723         }
1724 
1725         if (n >= 1 << 1) {
1726             n >>= 1;
1727         bit += 1;
1728         }
1729 
1730         return bit;
1731     }
1732     /**
1733      * Utility method to take a int[] containing codepoints and return
1734      * a string representation with code units.
1735      */
1736     public static String valueOf(int[]source){
1737         // TODO: Investigate why this method is not on UTF16 class
1738         StringBuilder result = new StringBuilder(source.length);
1739         for(int i=0; i<source.length; i++){
1740             result.appendCodePoint(source[i]);
1741         }
1742         return result.toString();
1743     }
1744 
1745 
1746     /**
1747      * Utility to duplicate a string count times
1748      * @param s String to be duplicated.
1749      * @param count Number of times to duplicate a string.
1750      */
1751     public static String repeat(String s, int count) {
1752         if (count <= 0) return "";
1753         if (count == 1) return s;
1754         StringBuilder result = new StringBuilder();
1755         for (int i = 0; i < count; ++i) {
1756             result.append(s);
1757         }
1758         return result.toString();
1759     }
1760 
1761     public static String[] splitString(String src, String target) {
1762         return src.split("\\Q" + target + "\\E");
1763     }
1764 
1765     /**
1766      * Split the string at runs of ascii whitespace characters.
1767      */
1768     public static String[] splitWhitespace(String src) {
1769         return src.split("\\s+");
1770     }
1771 
1772     /**
1773      * Parse a list of hex numbers and return a string
1774      * @param string String of hex numbers.
1775      * @param minLength Minimal length.
1776      * @param separator Separator.
1777      * @return A string from hex numbers.
1778      */
1779     public static String fromHex(String string, int minLength, String separator) {
1780         return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+"));
1781     }
1782 
1783     /**
1784      * Parse a list of hex numbers and return a string
1785      * @param string String of hex numbers.
1786      * @param minLength Minimal length.
1787      * @param separator Separator.
1788      * @return A string from hex numbers.
1789      */
1790     public static String fromHex(String string, int minLength, Pattern separator) {
1791         StringBuilder buffer = new StringBuilder();
1792         String[] parts = separator.split(string);
1793         for (String part : parts) {
1794             if (part.length() < minLength) {
1795                 throw new IllegalArgumentException("code point too short: " + part);
1796             }
1797             int cp = Integer.parseInt(part, 16);
1798             buffer.appendCodePoint(cp);
1799         }
1800         return buffer.toString();
1801     }
1802 
1803     /**
1804      * This implementation is equivalent to Java 8+ Math#addExact(int, int)
1805      * @param x the first value
1806      * @param y the second value
1807      * @return the result
1808      */
1809     public static int addExact(int x, int y) {
1810         int r = x + y;
1811         // HD 2-12 Overflow iff both arguments have the opposite sign of the result
1812         if (((x ^ r) & (y ^ r)) < 0) {
1813             throw new ArithmeticException("integer overflow");
1814         }
1815         return r;
1816     }
1817 
1818     /**
1819      * Returns whether the chars in the two CharSequences are equal.
1820      */
1821     public static boolean charSequenceEquals(CharSequence a, CharSequence b) {
1822         if (a == b) {
1823             return true;
1824         }
1825         if (a == null || b == null) {
1826             return false;
1827         }
1828         if (a.length() != b.length()) {
1829             return false;
1830         }
1831         for (int i = 0; i < a.length(); i++) {
1832             if (a.charAt(i) != b.charAt(i))
1833                 return false;
1834         }
1835         return true;
1836     }
1837 
1838     /**
1839      * Returns a hash code for a CharSequence that is equivalent to calling
1840      * charSequence.toString().hashCode()
1841      */
1842     public static int charSequenceHashCode(CharSequence value) {
1843         int hash = 0;
1844         for (int i = 0; i < value.length(); i++) {
1845             hash = hash * 31 + value.charAt(i);
1846         }
1847         return hash;
1848     }
1849 
1850     /**
1851      * Appends a CharSequence to an Appendable, converting IOException to ICUUncheckedIOException.
1852      */
1853     public static <A extends Appendable> A appendTo(CharSequence string, A appendable) {
1854         try {
1855             appendable.append(string);
1856             return appendable;
1857         } catch (IOException e) {
1858             throw new ICUUncheckedIOException(e);
1859         }
1860     }
1861 }
1862