1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2015, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import java.io.IOException; 12 import java.util.ArrayList; 13 import java.util.Iterator; 14 import java.util.Locale; 15 import java.util.regex.Pattern; 16 17 import com.ibm.icu.lang.UCharacter; 18 import com.ibm.icu.text.Replaceable; 19 import com.ibm.icu.text.UTF16; 20 import com.ibm.icu.text.UnicodeMatcher; 21 import com.ibm.icu.util.ICUUncheckedIOException; 22 23 public final class Utility { 24 25 private static final char APOSTROPHE = '\''; 26 private static final char BACKSLASH = '\\'; 27 private static final int MAGIC_UNSIGNED = 0x80000000; 28 29 /** 30 * Convenience utility to compare two Object[]s. 
31 * Ought to be in System 32 */ arrayEquals(Object[] source, Object target)33 public final static boolean arrayEquals(Object[] source, Object target) { 34 if (source == null) return (target == null); 35 if (!(target instanceof Object[])) return false; 36 Object[] targ = (Object[]) target; 37 return (source.length == targ.length 38 && arrayRegionMatches(source, 0, targ, 0, source.length)); 39 } 40 41 /** 42 * Convenience utility to compare two int[]s 43 * Ought to be in System 44 */ arrayEquals(int[] source, Object target)45 public final static boolean arrayEquals(int[] source, Object target) { 46 if (source == null) return (target == null); 47 if (!(target instanceof int[])) return false; 48 int[] targ = (int[]) target; 49 return (source.length == targ.length 50 && arrayRegionMatches(source, 0, targ, 0, source.length)); 51 } 52 53 /** 54 * Convenience utility to compare two double[]s 55 * Ought to be in System 56 */ arrayEquals(double[] source, Object target)57 public final static boolean arrayEquals(double[] source, Object target) { 58 if (source == null) return (target == null); 59 if (!(target instanceof double[])) return false; 60 double[] targ = (double[]) target; 61 return (source.length == targ.length 62 && arrayRegionMatches(source, 0, targ, 0, source.length)); 63 } arrayEquals(byte[] source, Object target)64 public final static boolean arrayEquals(byte[] source, Object target) { 65 if (source == null) return (target == null); 66 if (!(target instanceof byte[])) return false; 67 byte[] targ = (byte[]) target; 68 return (source.length == targ.length 69 && arrayRegionMatches(source, 0, targ, 0, source.length)); 70 } 71 72 /** 73 * Convenience utility to compare two Object[]s 74 * Ought to be in System 75 */ arrayEquals(Object source, Object target)76 public final static boolean arrayEquals(Object source, Object target) { 77 if (source == null) return (target == null); 78 // for some reason, the correct arrayEquals is not being called 79 // so do it by hand 
for now. 80 if (source instanceof Object[]) 81 return(arrayEquals((Object[]) source,target)); 82 if (source instanceof int[]) 83 return(arrayEquals((int[]) source,target)); 84 if (source instanceof double[]) 85 return(arrayEquals((double[]) source, target)); 86 if (source instanceof byte[]) 87 return(arrayEquals((byte[]) source,target)); 88 return source.equals(target); 89 } 90 91 /** 92 * Convenience utility to compare two Object[]s 93 * Ought to be in System. 94 * @param len the length to compare. 95 * The start indices and start+len must be valid. 96 */ arrayRegionMatches(Object[] source, int sourceStart, Object[] target, int targetStart, int len)97 public final static boolean arrayRegionMatches(Object[] source, int sourceStart, 98 Object[] target, int targetStart, 99 int len) 100 { 101 int sourceEnd = sourceStart + len; 102 int delta = targetStart - sourceStart; 103 for (int i = sourceStart; i < sourceEnd; i++) { 104 if (!arrayEquals(source[i],target[i + delta])) 105 return false; 106 } 107 return true; 108 } 109 110 /** 111 * Convenience utility to compare two Object[]s 112 * Ought to be in System. 113 * @param len the length to compare. 114 * The start indices and start+len must be valid. 115 */ arrayRegionMatches(char[] source, int sourceStart, char[] target, int targetStart, int len)116 public final static boolean arrayRegionMatches(char[] source, int sourceStart, 117 char[] target, int targetStart, 118 int len) 119 { 120 int sourceEnd = sourceStart + len; 121 int delta = targetStart - sourceStart; 122 for (int i = sourceStart; i < sourceEnd; i++) { 123 if (source[i]!=target[i + delta]) 124 return false; 125 } 126 return true; 127 } 128 129 /** 130 * Convenience utility to compare two int[]s. 131 * @param len the length to compare. 132 * The start indices and start+len must be valid. 
133 * Ought to be in System 134 */ arrayRegionMatches(int[] source, int sourceStart, int[] target, int targetStart, int len)135 public final static boolean arrayRegionMatches(int[] source, int sourceStart, 136 int[] target, int targetStart, 137 int len) 138 { 139 int sourceEnd = sourceStart + len; 140 int delta = targetStart - sourceStart; 141 for (int i = sourceStart; i < sourceEnd; i++) { 142 if (source[i] != target[i + delta]) 143 return false; 144 } 145 return true; 146 } 147 148 /** 149 * Convenience utility to compare two arrays of doubles. 150 * @param len the length to compare. 151 * The start indices and start+len must be valid. 152 * Ought to be in System 153 */ arrayRegionMatches(double[] source, int sourceStart, double[] target, int targetStart, int len)154 public final static boolean arrayRegionMatches(double[] source, int sourceStart, 155 double[] target, int targetStart, 156 int len) 157 { 158 int sourceEnd = sourceStart + len; 159 int delta = targetStart - sourceStart; 160 for (int i = sourceStart; i < sourceEnd; i++) { 161 if (source[i] != target[i + delta]) 162 return false; 163 } 164 return true; 165 } arrayRegionMatches(byte[] source, int sourceStart, byte[] target, int targetStart, int len)166 public final static boolean arrayRegionMatches(byte[] source, int sourceStart, 167 byte[] target, int targetStart, int len){ 168 int sourceEnd = sourceStart + len; 169 int delta = targetStart - sourceStart; 170 for (int i = sourceStart; i < sourceEnd; i++) { 171 if (source[i] != target[i + delta]) 172 return false; 173 } 174 return true; 175 } 176 177 /** 178 * Trivial reference equality. 179 * This method should help document that we really want == not equals(), 180 * and to have a single place to suppress warnings from static analysis tools. 181 */ sameObjects(Object a, Object b)182 public static final boolean sameObjects(Object a, Object b) { 183 return a == b; 184 } 185 186 /** 187 * Convenience utility. 
Does null checks on objects, then calls compare. 188 */ checkCompare(T a, T b)189 public static <T extends Comparable<T>> int checkCompare(T a, T b) { 190 return a == null ? 191 b == null ? 0 : -1 : 192 b == null ? 1 : a.compareTo(b); 193 } 194 195 /** 196 * Convenience utility. Does null checks on object, then calls hashCode. 197 */ checkHash(Object a)198 public static int checkHash(Object a) { 199 return a == null ? 0 : a.hashCode(); 200 } 201 202 /** 203 * The ESCAPE character is used during run-length encoding. It signals 204 * a run of identical chars. 205 */ 206 private static final char ESCAPE = '\uA5A5'; 207 208 /** 209 * The ESCAPE_BYTE character is used during run-length encoding. It signals 210 * a run of identical bytes. 211 */ 212 static final byte ESCAPE_BYTE = (byte)0xA5; 213 214 /** 215 * Construct a string representing an int array. Use run-length encoding. 216 * A character represents itself, unless it is the ESCAPE character. Then 217 * the following notations are possible: 218 * ESCAPE ESCAPE ESCAPE literal 219 * ESCAPE n c n instances of character c 220 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 221 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 222 * If we encounter a run where n == ESCAPE, we represent this as: 223 * c ESCAPE n-1 c 224 * The ESCAPE value is chosen so as not to collide with commonly 225 * seen values. 
226 */ arrayToRLEString(int[] a)227 static public final String arrayToRLEString(int[] a) { 228 StringBuilder buffer = new StringBuilder(); 229 230 appendInt(buffer, a.length); 231 int runValue = a[0]; 232 int runLength = 1; 233 for (int i=1; i<a.length; ++i) { 234 int s = a[i]; 235 if (s == runValue && runLength < 0xFFFF) { 236 ++runLength; 237 } else { 238 encodeRun(buffer, runValue, runLength); 239 runValue = s; 240 runLength = 1; 241 } 242 } 243 encodeRun(buffer, runValue, runLength); 244 return buffer.toString(); 245 } 246 247 /** 248 * Construct a string representing a short array. Use run-length encoding. 249 * A character represents itself, unless it is the ESCAPE character. Then 250 * the following notations are possible: 251 * ESCAPE ESCAPE ESCAPE literal 252 * ESCAPE n c n instances of character c 253 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 254 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 255 * If we encounter a run where n == ESCAPE, we represent this as: 256 * c ESCAPE n-1 c 257 * The ESCAPE value is chosen so as not to collide with commonly 258 * seen values. 259 */ arrayToRLEString(short[] a)260 static public final String arrayToRLEString(short[] a) { 261 StringBuilder buffer = new StringBuilder(); 262 // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]); 263 buffer.append((char) (a.length >> 16)); 264 buffer.append((char) a.length); 265 short runValue = a[0]; 266 int runLength = 1; 267 for (int i=1; i<a.length; ++i) { 268 short s = a[i]; 269 if (s == runValue && runLength < 0xFFFF) ++runLength; 270 else { 271 encodeRun(buffer, runValue, runLength); 272 runValue = s; 273 runLength = 1; 274 } 275 } 276 encodeRun(buffer, runValue, runLength); 277 return buffer.toString(); 278 } 279 280 /** 281 * Construct a string representing a char array. Use run-length encoding. 282 * A character represents itself, unless it is the ESCAPE character. 
Then 283 * the following notations are possible: 284 * ESCAPE ESCAPE ESCAPE literal 285 * ESCAPE n c n instances of character c 286 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 287 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 288 * If we encounter a run where n == ESCAPE, we represent this as: 289 * c ESCAPE n-1 c 290 * The ESCAPE value is chosen so as not to collide with commonly 291 * seen values. 292 */ arrayToRLEString(char[] a)293 static public final String arrayToRLEString(char[] a) { 294 StringBuilder buffer = new StringBuilder(); 295 buffer.append((char) (a.length >> 16)); 296 buffer.append((char) a.length); 297 char runValue = a[0]; 298 int runLength = 1; 299 for (int i=1; i<a.length; ++i) { 300 char s = a[i]; 301 if (s == runValue && runLength < 0xFFFF) ++runLength; 302 else { 303 encodeRun(buffer, (short)runValue, runLength); 304 runValue = s; 305 runLength = 1; 306 } 307 } 308 encodeRun(buffer, (short)runValue, runLength); 309 return buffer.toString(); 310 } 311 312 /** 313 * Construct a string representing a byte array. Use run-length encoding. 314 * Two bytes are packed into a single char, with a single extra zero byte at 315 * the end if needed. A byte represents itself, unless it is the 316 * ESCAPE_BYTE. Then the following notations are possible: 317 * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal 318 * ESCAPE_BYTE n b n instances of byte b 319 * Since an encoded run occupies 3 bytes, we only encode runs of 4 or 320 * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. 321 * If we encounter a run where n == ESCAPE_BYTE, we represent this as: 322 * b ESCAPE_BYTE n-1 b 323 * The ESCAPE_BYTE value is chosen so as not to collide with commonly 324 * seen values. 
325 */ arrayToRLEString(byte[] a)326 static public final String arrayToRLEString(byte[] a) { 327 StringBuilder buffer = new StringBuilder(); 328 buffer.append((char) (a.length >> 16)); 329 buffer.append((char) a.length); 330 byte runValue = a[0]; 331 int runLength = 1; 332 byte[] state = new byte[2]; 333 for (int i=1; i<a.length; ++i) { 334 byte b = a[i]; 335 if (b == runValue && runLength < 0xFF) ++runLength; 336 else { 337 encodeRun(buffer, runValue, runLength, state); 338 runValue = b; 339 runLength = 1; 340 } 341 } 342 encodeRun(buffer, runValue, runLength, state); 343 344 // We must save the final byte, if there is one, by padding 345 // an extra zero. 346 if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state); 347 348 return buffer.toString(); 349 } 350 351 /** 352 * Encode a run, possibly a degenerate run (of < 4 values). 353 * @param length The length of the run; must be > 0 && <= 0xFFFF. 354 */ encodeRun(T buffer, int value, int length)355 private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) { 356 if (length < 4) { 357 for (int j=0; j<length; ++j) { 358 if (value == ESCAPE) { 359 appendInt(buffer, value); 360 } 361 appendInt(buffer, value); 362 } 363 } 364 else { 365 if (length == ESCAPE) { 366 if (value == ESCAPE) { 367 appendInt(buffer, ESCAPE); 368 } 369 appendInt(buffer, value); 370 --length; 371 } 372 appendInt(buffer, ESCAPE); 373 appendInt(buffer, length); 374 appendInt(buffer, value); // Don't need to escape this value 375 } 376 } 377 appendInt(T buffer, int value)378 private static final <T extends Appendable> void appendInt(T buffer, int value) { 379 try { 380 buffer.append((char)(value >>> 16)); 381 buffer.append((char)(value & 0xFFFF)); 382 } catch (IOException e) { 383 throw new IllegalIcuArgumentException(e); 384 } 385 } 386 387 /** 388 * Encode a run, possibly a degenerate run (of < 4 values). 389 * @param length The length of the run; must be > 0 && <= 0xFFFF. 
390 */ encodeRun(T buffer, short value, int length)391 private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) { 392 try { 393 char valueChar = (char) value; 394 if (length < 4) { 395 for (int j=0; j<length; ++j) { 396 if (valueChar == ESCAPE) { 397 buffer.append(ESCAPE); 398 } 399 buffer.append(valueChar); 400 } 401 } 402 else { 403 if (length == ESCAPE) { 404 if (valueChar == ESCAPE) { 405 buffer.append(ESCAPE); 406 } 407 buffer.append(valueChar); 408 --length; 409 } 410 buffer.append(ESCAPE); 411 buffer.append((char) length); 412 buffer.append(valueChar); // Don't need to escape this value 413 } 414 } catch (IOException e) { 415 throw new IllegalIcuArgumentException(e); 416 } 417 } 418 419 /** 420 * Encode a run, possibly a degenerate run (of < 4 values). 421 * @param length The length of the run; must be > 0 && <= 0xFF. 422 */ encodeRun(T buffer, byte value, int length, byte[] state)423 private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length, 424 byte[] state) { 425 if (length < 4) { 426 for (int j=0; j<length; ++j) { 427 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 428 appendEncodedByte(buffer, value, state); 429 } 430 } 431 else { 432 if ((byte)length == ESCAPE_BYTE) { 433 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 434 appendEncodedByte(buffer, value, state); 435 --length; 436 } 437 appendEncodedByte(buffer, ESCAPE_BYTE, state); 438 appendEncodedByte(buffer, (byte)length, state); 439 appendEncodedByte(buffer, value, state); // Don't need to escape this value 440 } 441 } 442 443 /** 444 * Append a byte to the given Appendable, packing two bytes into each 445 * character. The state parameter maintains intermediary data between 446 * calls. 
447 * @param state A two-element array, with state[0] == 0 if this is the 448 * first byte of a pair, or state[0] != 0 if this is the second byte 449 * of a pair, in which case state[1] is the first byte. 450 */ appendEncodedByte(T buffer, byte value, byte[] state)451 private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value, 452 byte[] state) { 453 try { 454 if (state[0] != 0) { 455 char c = (char) ((state[1] << 8) | ((value) & 0xFF)); 456 buffer.append(c); 457 state[0] = 0; 458 } 459 else { 460 state[0] = 1; 461 state[1] = value; 462 } 463 } catch (IOException e) { 464 throw new IllegalIcuArgumentException(e); 465 } 466 } 467 468 /** 469 * Construct an array of ints from a run-length encoded string. 470 */ RLEStringToIntArray(String s)471 static public final int[] RLEStringToIntArray(String s) { 472 int length = getInt(s, 0); 473 int[] array = new int[length]; 474 int ai = 0, i = 1; 475 476 int maxI = s.length() / 2; 477 while (ai < length && i < maxI) { 478 int c = getInt(s, i++); 479 480 if (c == ESCAPE) { 481 c = getInt(s, i++); 482 if (c == ESCAPE) { 483 array[ai++] = c; 484 } else { 485 int runLength = c; 486 int runValue = getInt(s, i++); 487 for (int j=0; j<runLength; ++j) { 488 array[ai++] = runValue; 489 } 490 } 491 } 492 else { 493 array[ai++] = c; 494 } 495 } 496 497 if (ai != length || i != maxI) { 498 throw new IllegalStateException("Bad run-length encoded int array"); 499 } 500 501 return array; 502 } getInt(String s, int i)503 static final int getInt(String s, int i) { 504 return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1); 505 } 506 507 /** 508 * Construct an array of shorts from a run-length encoded string. 
509 */ RLEStringToShortArray(String s)510 static public final short[] RLEStringToShortArray(String s) { 511 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 512 short[] array = new short[length]; 513 int ai = 0; 514 for (int i=2; i<s.length(); ++i) { 515 char c = s.charAt(i); 516 if (c == ESCAPE) { 517 c = s.charAt(++i); 518 if (c == ESCAPE) { 519 array[ai++] = (short) c; 520 } else { 521 int runLength = c; 522 short runValue = (short) s.charAt(++i); 523 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 524 } 525 } 526 else { 527 array[ai++] = (short) c; 528 } 529 } 530 531 if (ai != length) 532 throw new IllegalStateException("Bad run-length encoded short array"); 533 534 return array; 535 } 536 537 /** 538 * Construct an array of shorts from a run-length encoded string. 539 */ RLEStringToCharArray(String s)540 static public final char[] RLEStringToCharArray(String s) { 541 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 542 char[] array = new char[length]; 543 int ai = 0; 544 for (int i=2; i<s.length(); ++i) { 545 char c = s.charAt(i); 546 if (c == ESCAPE) { 547 c = s.charAt(++i); 548 if (c == ESCAPE) { 549 array[ai++] = c; 550 } else { 551 int runLength = c; 552 char runValue = s.charAt(++i); 553 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 554 } 555 } 556 else { 557 array[ai++] = c; 558 } 559 } 560 561 if (ai != length) 562 throw new IllegalStateException("Bad run-length encoded short array"); 563 564 return array; 565 } 566 567 /** 568 * Construct an array of bytes from a run-length encoded string. 569 */ RLEStringToByteArray(String s)570 static public final byte[] RLEStringToByteArray(String s) { 571 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 572 byte[] array = new byte[length]; 573 boolean nextChar = true; 574 char c = 0; 575 int node = 0; 576 int runLength = 0; 577 int i = 2; 578 for (int ai=0; ai<length; ) { 579 // This part of the loop places the next byte into the local 580 // variable 'b' each time through the loop. 
It keeps the 581 // current character in 'c' and uses the boolean 'nextChar' 582 // to see if we've taken both bytes out of 'c' yet. 583 byte b; 584 if (nextChar) { 585 c = s.charAt(i++); 586 b = (byte) (c >> 8); 587 nextChar = false; 588 } 589 else { 590 b = (byte) (c & 0xFF); 591 nextChar = true; 592 } 593 594 // This part of the loop is a tiny state machine which handles 595 // the parsing of the run-length encoding. This would be simpler 596 // if we could look ahead, but we can't, so we use 'node' to 597 // move between three nodes in the state machine. 598 switch (node) { 599 case 0: 600 // Normal idle node 601 if (b == ESCAPE_BYTE) { 602 node = 1; 603 } 604 else { 605 array[ai++] = b; 606 } 607 break; 608 case 1: 609 // We have seen one ESCAPE_BYTE; we expect either a second 610 // one, or a run length and value. 611 if (b == ESCAPE_BYTE) { 612 array[ai++] = ESCAPE_BYTE; 613 node = 0; 614 } 615 else { 616 runLength = b; 617 // Interpret signed byte as unsigned 618 if (runLength < 0) runLength += 0x100; 619 node = 2; 620 } 621 break; 622 case 2: 623 // We have seen an ESCAPE_BYTE and length byte. We interpret 624 // the next byte as the value to be repeated. 625 for (int j=0; j<runLength; ++j) array[ai++] = b; 626 node = 0; 627 break; 628 } 629 } 630 631 if (node != 0) 632 throw new IllegalStateException("Bad run-length encoded byte array"); 633 634 if (i != s.length()) 635 throw new IllegalStateException("Excess data in RLE byte array string"); 636 637 return array; 638 } 639 640 static public String LINE_SEPARATOR = System.getProperty("line.separator"); 641 642 /** 643 * Format a String for representation in a source file. This includes 644 * breaking it into lines and escaping characters using octal notation 645 * when necessary (control characters and double quotes). 
646 */ formatForSource(String s)647 static public final String formatForSource(String s) { 648 StringBuilder buffer = new StringBuilder(); 649 for (int i=0; i<s.length();) { 650 if (i > 0) buffer.append('+').append(LINE_SEPARATOR); 651 buffer.append(" \""); 652 int count = 11; 653 while (i<s.length() && count<80) { 654 char c = s.charAt(i++); 655 if (c < '\u0020' || c == '"' || c == '\\') { 656 if (c == '\n') { 657 buffer.append("\\n"); 658 count += 2; 659 } else if (c == '\t') { 660 buffer.append("\\t"); 661 count += 2; 662 } else if (c == '\r') { 663 buffer.append("\\r"); 664 count += 2; 665 } else { 666 // Represent control characters, backslash and double quote 667 // using octal notation; otherwise the string we form 668 // won't compile, since Unicode escape sequences are 669 // processed before tokenization. 670 buffer.append('\\'); 671 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 672 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 673 buffer.append(HEX_DIGIT[(c & 0007)]); 674 count += 4; 675 } 676 } 677 else if (c <= '\u007E') { 678 buffer.append(c); 679 count += 1; 680 } 681 else { 682 buffer.append("\\u"); 683 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 684 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 685 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 686 buffer.append(HEX_DIGIT[(c & 0x000F)]); 687 count += 6; 688 } 689 } 690 buffer.append('"'); 691 } 692 return buffer.toString(); 693 } 694 695 static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7', 696 '8','9','A','B','C','D','E','F'}; 697 698 /** 699 * Format a String for representation in a source file. Like 700 * formatForSource but does not do line breaking. 
701 */ format1ForSource(String s)702 static public final String format1ForSource(String s) { 703 StringBuilder buffer = new StringBuilder(); 704 buffer.append("\""); 705 for (int i=0; i<s.length();) { 706 char c = s.charAt(i++); 707 if (c < '\u0020' || c == '"' || c == '\\') { 708 if (c == '\n') { 709 buffer.append("\\n"); 710 } else if (c == '\t') { 711 buffer.append("\\t"); 712 } else if (c == '\r') { 713 buffer.append("\\r"); 714 } else { 715 // Represent control characters, backslash and double quote 716 // using octal notation; otherwise the string we form 717 // won't compile, since Unicode escape sequences are 718 // processed before tokenization. 719 buffer.append('\\'); 720 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 721 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 722 buffer.append(HEX_DIGIT[(c & 0007)]); 723 } 724 } 725 else if (c <= '\u007E') { 726 buffer.append(c); 727 } 728 else { 729 buffer.append("\\u"); 730 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 731 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 732 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 733 buffer.append(HEX_DIGIT[(c & 0x000F)]); 734 } 735 } 736 buffer.append('"'); 737 return buffer.toString(); 738 } 739 740 /** 741 * Convert characters outside the range U+0020 to U+007F to 742 * Unicode escapes, and convert backslash to a double backslash. 743 */ escape(String s)744 public static final String escape(String s) { 745 StringBuilder buf = new StringBuilder(); 746 for (int i=0; i<s.length(); ) { 747 int c = Character.codePointAt(s, i); 748 i += UTF16.getCharCount(c); 749 if (c >= ' ' && c <= 0x007F) { 750 if (c == '\\') { 751 buf.append("\\\\"); // That is, "\\" 752 } else { 753 buf.append((char)c); 754 } 755 } else { 756 boolean four = c <= 0xFFFF; 757 buf.append(four ? "\\u" : "\\U"); 758 buf.append(hex(c, four ? 
4 : 8)); 759 } 760 } 761 return buf.toString(); 762 } 763 764 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 765 static private final char[] UNESCAPE_MAP = { 766 /*" 0x22, 0x22 */ 767 /*' 0x27, 0x27 */ 768 /*? 0x3F, 0x3F */ 769 /*\ 0x5C, 0x5C */ 770 /*a*/ 0x61, 0x07, 771 /*b*/ 0x62, 0x08, 772 /*e*/ 0x65, 0x1b, 773 /*f*/ 0x66, 0x0c, 774 /*n*/ 0x6E, 0x0a, 775 /*r*/ 0x72, 0x0d, 776 /*t*/ 0x74, 0x09, 777 /*v*/ 0x76, 0x0b 778 }; 779 780 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ _digit8(int c)781 private static final int _digit8(int c) { 782 if (c >= '0' && c <= '7') { 783 return c - '0'; 784 } 785 return -1; 786 } 787 788 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ _digit16(int c)789 private static final int _digit16(int c) { 790 if (c >= '0' && c <= '9') { 791 return c - '0'; 792 } 793 if (c >= 'A' && c <= 'F') { 794 return c - ('A' - 10); 795 } 796 if (c >= 'a' && c <= 'f') { 797 return c - ('a' - 10); 798 } 799 return -1; 800 } 801 802 /** 803 * Converts an escape to a code point value. We attempt 804 * to parallel the icu4c unescapeAt() function. 805 * This function returns an integer with 806 * both the code point (bits 28..8) and the length of the escape sequence (bits 7..0). 807 * offset+length is the index after the escape sequence. 808 * 809 * @param offset the offset to the character <em>after</em> the backslash. 810 * @return the code point and length, or -1 on error. 
811 */ unescapeAndLengthAt(CharSequence s, int offset)812 public static int unescapeAndLengthAt(CharSequence s, int offset) { 813 return unescapeAndLengthAt(s, offset, s.length()); 814 } 815 unescapeAndLengthAt(CharSequence s, int offset, int length)816 private static int unescapeAndLengthAt(CharSequence s, int offset, int length) { 817 int result = 0; 818 int n = 0; 819 int minDig = 0; 820 int maxDig = 0; 821 int bitsPerDigit = 4; 822 int dig; 823 boolean braces = false; 824 825 /* Check that offset is in range */ 826 if (offset < 0 || offset >= length) { 827 return -1; 828 } 829 int start = offset; 830 831 /* Fetch first UChar after '\\' */ 832 int c = s.charAt(offset++); 833 834 /* Convert hexadecimal and octal escapes */ 835 switch (c) { 836 case 'u': 837 minDig = maxDig = 4; 838 break; 839 case 'U': 840 minDig = maxDig = 8; 841 break; 842 case 'x': 843 minDig = 1; 844 if (offset < length && s.charAt(offset) == '{') { 845 ++offset; 846 braces = true; 847 maxDig = 8; 848 } else { 849 maxDig = 2; 850 } 851 break; 852 default: 853 dig = _digit8(c); 854 if (dig >= 0) { 855 minDig = 1; 856 maxDig = 3; 857 n = 1; /* Already have first octal digit */ 858 bitsPerDigit = 3; 859 result = dig; 860 } 861 break; 862 } 863 if (minDig != 0) { 864 while (offset < length && n < maxDig) { 865 c = s.charAt(offset); 866 dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c); 867 if (dig < 0) { 868 break; 869 } 870 result = (result << bitsPerDigit) | dig; 871 ++offset; 872 ++n; 873 } 874 if (n < minDig) { 875 return -1; 876 } 877 if (braces) { 878 if (c != '}') { 879 return -1; 880 } 881 ++offset; 882 } 883 if (result < 0 || result >= 0x110000) { 884 return -1; 885 } 886 // If an escape sequence specifies a lead surrogate, see 887 // if there is a trail surrogate after it, either as an 888 // escape or as a literal. If so, join them up into a 889 // supplementary. 
890 if (offset < length && UTF16.isLeadSurrogate(result)) { 891 int ahead = offset+1; 892 c = s.charAt(offset); 893 if (c == '\\' && ahead < length) { 894 // Calling ourselves recursively may cause a stack overflow if 895 // we have repeated escaped lead surrogates. 896 // Limit the length to 11 ("x{0000DFFF}") after ahead. 897 int tailLimit = ahead + 11; 898 if (tailLimit > length) { 899 tailLimit = length; 900 } 901 int cpAndLength = unescapeAndLengthAt(s, ahead, tailLimit); 902 if (cpAndLength >= 0) { 903 c = cpAndLength >> 8; 904 ahead += cpAndLength & 0xff; 905 } 906 } 907 if (UTF16.isTrailSurrogate(c)) { 908 offset = ahead; 909 result = UCharacter.toCodePoint(result, c); 910 } 911 } 912 return codePointAndLength(result, start, offset); 913 } 914 915 /* Convert C-style escapes in table */ 916 for (int i=0; i<UNESCAPE_MAP.length; i+=2) { 917 if (c == UNESCAPE_MAP[i]) { 918 return codePointAndLength(UNESCAPE_MAP[i+1], start, offset); 919 } else if (c < UNESCAPE_MAP[i]) { 920 break; 921 } 922 } 923 924 /* Map \cX to control-X: X & 0x1F */ 925 if (c == 'c' && offset < length) { 926 c = Character.codePointAt(s, offset); 927 return codePointAndLength(c & 0x1F, start, offset + Character.charCount(c)); 928 } 929 930 /* If no special forms are recognized, then consider 931 * the backslash to generically escape the next character. 932 * Deal with surrogate pairs. 
*/ 933 if (UTF16.isLeadSurrogate(c) && offset < length) { 934 int c2 = s.charAt(offset); 935 if (UTF16.isTrailSurrogate(c2)) { 936 ++offset; 937 c = UCharacter.toCodePoint(c, c2); 938 } 939 } 940 return codePointAndLength(c, start, offset); 941 } 942 codePointAndLength(int c, int length)943 private static int codePointAndLength(int c, int length) { 944 assert 0 <= c && c <= 0x10ffff; 945 assert 0 <= length && length <= 0xff; 946 return c << 8 | length; 947 } 948 codePointAndLength(int c, int start, int limit)949 private static int codePointAndLength(int c, int start, int limit) { 950 return codePointAndLength(c, limit - start); 951 } 952 cpFromCodePointAndLength(int cpAndLength)953 public static int cpFromCodePointAndLength(int cpAndLength) { 954 assert cpAndLength >= 0; 955 return cpAndLength >> 8; 956 } 957 lengthFromCodePointAndLength(int cpAndLength)958 public static int lengthFromCodePointAndLength(int cpAndLength) { 959 assert cpAndLength >= 0; 960 return cpAndLength & 0xff; 961 } 962 963 /** 964 * Convert all escapes in a given string using unescapeAndLengthAt(). 965 * @exception IllegalArgumentException if an invalid escape is 966 * seen. 967 */ unescape(CharSequence s)968 public static String unescape(CharSequence s) { 969 StringBuilder buf = null; 970 for (int i=0; i<s.length(); ) { 971 char c = s.charAt(i++); 972 if (c == '\\') { 973 if (buf == null) { 974 buf = new StringBuilder(s.length()).append(s, 0, i - 1); 975 } 976 int cpAndLength = unescapeAndLengthAt(s, i); 977 if (cpAndLength < 0) { 978 throw new IllegalArgumentException("Invalid escape sequence " + 979 s.subSequence(i-1, Math.min(i+9, s.length()))); 980 } 981 buf.appendCodePoint(cpAndLength >> 8); 982 i += cpAndLength & 0xff; 983 } else if (buf != null) { 984 // We could optimize this further by appending whole substrings between escapes. 985 buf.append(c); 986 } 987 } 988 if (buf == null) { 989 // No escapes in s. 
990 return s.toString(); 991 } 992 return buf.toString(); 993 } 994 995 /** 996 * Convert all escapes in a given string using unescapeAndLengthAt(). 997 * Leave invalid escape sequences unchanged. 998 */ unescapeLeniently(CharSequence s)999 public static String unescapeLeniently(CharSequence s) { 1000 StringBuilder buf = null; 1001 for (int i=0; i<s.length(); ) { 1002 char c = s.charAt(i++); 1003 if (c == '\\') { 1004 if (buf == null) { 1005 buf = new StringBuilder(s.length()).append(s, 0, i - 1); 1006 } 1007 int cpAndLength = unescapeAndLengthAt(s, i); 1008 if (cpAndLength < 0) { 1009 buf.append(c); 1010 } else { 1011 buf.appendCodePoint(cpAndLength >> 8); 1012 i += cpAndLength & 0xff; 1013 } 1014 } else if (buf != null) { 1015 // We could optimize this further by appending whole substrings between escapes. 1016 buf.append(c); 1017 } 1018 } 1019 if (buf == null) { 1020 // No escapes in s. 1021 return s.toString(); 1022 } 1023 return buf.toString(); 1024 } 1025 1026 /** 1027 * Convert a char to 4 hex uppercase digits. E.g., hex('a') => 1028 * "0041". 1029 */ hex(long ch)1030 public static String hex(long ch) { 1031 return hex(ch, 4); 1032 } 1033 1034 /** 1035 * Supplies a zero-padded hex representation of an integer (without 0x) 1036 */ hex(long i, int places)1037 static public String hex(long i, int places) { 1038 if (i == Long.MIN_VALUE) return "-8000000000000000"; 1039 boolean negative = i < 0; 1040 if (negative) { 1041 i = -i; 1042 } 1043 String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); 1044 if (result.length() < places) { 1045 result = "0000000000000000".substring(result.length(),places) + result; 1046 } 1047 if (negative) { 1048 return '-' + result; 1049 } 1050 return result; 1051 } 1052 1053 /** 1054 * Convert a string to comma-separated groups of 4 hex uppercase 1055 * digits. E.g., hex('ab') => "0041,0042". 
1056 */ 1057 public static String hex(CharSequence s) { 1058 return hex(s, 4, ",", true, new StringBuilder()).toString(); 1059 } 1060 1061 /** 1062 * Convert a string to separated groups of hex uppercase 1063 * digits. E.g., hex('ab'...) => "0041,0042". Append the output 1064 * to the given Appendable. 1065 */ 1066 public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) { 1067 try { 1068 if (useCodePoints) { 1069 int cp; 1070 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1071 cp = Character.codePointAt(s, i); 1072 if (i != 0) { 1073 result.append(separator); 1074 } 1075 result.append(hex(cp,width)); 1076 } 1077 } else { 1078 for (int i = 0; i < s.length(); ++i) { 1079 if (i != 0) { 1080 result.append(separator); 1081 } 1082 result.append(hex(s.charAt(i),width)); 1083 } 1084 } 1085 return result; 1086 } catch (IOException e) { 1087 throw new IllegalIcuArgumentException(e); 1088 } 1089 } 1090 1091 public static String hex(byte[] o, int start, int end, String separator) { 1092 StringBuilder result = new StringBuilder(); 1093 //int ch; 1094 for (int i = start; i < end; ++i) { 1095 if (i != 0) result.append(separator); 1096 result.append(hex(o[i])); 1097 } 1098 return result.toString(); 1099 } 1100 1101 /** 1102 * Convert a string to comma-separated groups of 4 hex uppercase 1103 * digits. E.g., hex('ab') => "0041,0042". 1104 */ 1105 public static <S extends CharSequence> String hex(S s, int width, S separator) { 1106 return hex(s, width, separator, true, new StringBuilder()).toString(); 1107 } 1108 1109 /** 1110 * Split a string into pieces based on the given divider character 1111 * @param s the string to split 1112 * @param divider the character on which to split. Occurrences of 1113 * this character are not included in the output 1114 * @param output an array to receive the substrings between 1115 * instances of divider. 
It must be large enough on entry to 1116 * accommodate all output. Adjacent instances of the divider 1117 * character will place empty strings into output. Before 1118 * returning, output is padded out with empty strings. 1119 */ 1120 public static void split(String s, char divider, String[] output) { 1121 int last = 0; 1122 int current = 0; 1123 int i; 1124 for (i = 0; i < s.length(); ++i) { 1125 if (s.charAt(i) == divider) { 1126 output[current++] = s.substring(last,i); 1127 last = i+1; 1128 } 1129 } 1130 output[current++] = s.substring(last,i); 1131 while (current < output.length) { 1132 output[current++] = ""; 1133 } 1134 } 1135 1136 /** 1137 * Split a string into pieces based on the given divider character 1138 * @param s the string to split 1139 * @param divider the character on which to split. Occurrences of 1140 * this character are not included in the output 1141 * @return output an array to receive the substrings between 1142 * instances of divider. Adjacent instances of the divider 1143 * character will place empty strings into output. 1144 */ 1145 public static String[] split(String s, char divider) { 1146 int last = 0; 1147 int i; 1148 ArrayList<String> output = new ArrayList<>(); 1149 for (i = 0; i < s.length(); ++i) { 1150 if (s.charAt(i) == divider) { 1151 output.add(s.substring(last,i)); 1152 last = i+1; 1153 } 1154 } 1155 output.add( s.substring(last,i)); 1156 return output.toArray(new String[output.size()]); 1157 } 1158 1159 /** 1160 * Look up a given string in a string array. Returns the index at 1161 * which the first occurrence of the string was found in the 1162 * array, or -1 if it was not found. 
1163 * @param source the string to search for 1164 * @param target the array of zero or more strings in which to 1165 * look for source 1166 * @return the index of target at which source first occurs, or -1 1167 * if not found 1168 */ 1169 public static int lookup(String source, String[] target) { 1170 for (int i = 0; i < target.length; ++i) { 1171 if (source.equals(target[i])) return i; 1172 } 1173 return -1; 1174 } 1175 1176 /** 1177 * Parse a single non-whitespace character 'ch', optionally 1178 * preceded by whitespace. 1179 * @param id the string to be parsed 1180 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 1181 * offset of the first character to be parsed. On output, pos[0] 1182 * is the index after the last parsed character. If the parse 1183 * fails, pos[0] will be unchanged. 1184 * @param ch the non-whitespace character to be parsed. 1185 * @return true if 'ch' is seen preceded by zero or more 1186 * whitespace characters. 1187 */ 1188 public static boolean parseChar(String id, int[] pos, char ch) { 1189 int start = pos[0]; 1190 pos[0] = PatternProps.skipWhiteSpace(id, pos[0]); 1191 if (pos[0] == id.length() || 1192 id.charAt(pos[0]) != ch) { 1193 pos[0] = start; 1194 return false; 1195 } 1196 ++pos[0]; 1197 return true; 1198 } 1199 1200 /** 1201 * Parse a pattern string starting at offset pos. Keywords are 1202 * matched case-insensitively. Spaces may be skipped and may be 1203 * optional or required. Integer values may be parsed, and if 1204 * they are, they will be returned in the given array. If 1205 * successful, the offset of the next non-space character is 1206 * returned. On failure, -1 is returned. 1207 * @param pattern must only contain lowercase characters, which 1208 * will match their uppercase equivalents as well. A space 1209 * character matches one or more required spaces. A '~' character 1210 * matches zero or more optional spaces. 
A '#' character matches
     * an integer and stores it in parsedInts, which the caller must
     * ensure has enough capacity.
     * @param parsedInts array to receive parsed integers.  Caller
     * must ensure that parsedInts.length is >= the number of '#'
     * signs in 'pattern'.
     * @param rule the text being parsed
     * @param pos offset in rule at which parsing starts
     * @param limit offset in rule at which required-space, integer and
     * literal matching stops.  Note that the whitespace skipping done
     * for ' ' and '~' is not passed this limit.
     * @return the position after the last character parsed, or -1 if
     * the parse failed
     */
    @SuppressWarnings("fallthrough")
    public static int parsePattern(String rule, int pos, int limit,
            String pattern, int[] parsedInts) {
        // TODO Update this to handle surrogates
        // One-element scratch array: parseInteger reports its end offset through it.
        int[] p = new int[1];
        int intCount = 0; // number of integers parsed
        for (int i=0; i<pattern.length(); ++i) {
            char cpat = pattern.charAt(i);
            char c;
            switch (cpat) {
            case ' ':
                // Required space: at least one whitespace char must be present...
                if (pos >= limit) {
                    return -1;
                }
                c = rule.charAt(pos++);
                if (!PatternProps.isWhiteSpace(c)) {
                    return -1;
                }
                // ...and any further whitespace is consumed by the '~' case below.
                // FALL THROUGH to skipWhitespace
            case '~':
                pos = PatternProps.skipWhiteSpace(rule, pos);
                break;
            case '#':
                p[0] = pos;
                parsedInts[intCount++] = parseInteger(rule, p, limit);
                // parseInteger leaves p[0] untouched when it consumed nothing.
                if (p[0] == pos) {
                    // Syntax error; failed to parse integer
                    return -1;
                }
                pos = p[0];
                break;
            default:
                // Literal: compare the lowercased rule char with the pattern char.
                if (pos >= limit) {
                    return -1;
                }
                c = (char) UCharacter.toLowerCase(rule.charAt(pos++));
                if (c != cpat) {
                    return -1;
                }
                break;
            }
        }
        return pos;
    }

    /**
     * Parse a pattern string within the given Replaceable and a parsing
     * pattern.  Characters are matched literally and case-sensitively
     * except for the following special characters:
     *
     * ~  zero or more Pattern_White_Space chars
     *
     * If end of pattern is reached with all matches along the way,
     * pos is advanced to the first unparsed index and returned.
     * Otherwise -1 is returned.
* @param pat pattern that controls parsing
     * @param text text to be parsed, starting at index
     * @param index offset to first character to parse
     * @param limit offset after last character to parse
     * @return index after last parsed character, or -1 on parse failure.
     */
    public static int parsePattern(String pat,
            Replaceable text,
            int index,
            int limit) {
        // ipat is the offset of the current pattern code point within pat.
        int ipat = 0;

        // empty pattern matches immediately
        if (ipat == pat.length()) {
            return index;
        }

        int cpat = Character.codePointAt(pat, ipat);

        while (index < limit) {
            int c = text.char32At(index);

            // parse \s*
            if (cpat == '~') {
                if (PatternProps.isWhiteSpace(c)) {
                    // Stay on '~' and keep consuming whitespace.
                    index += UTF16.getCharCount(c);
                    continue;
                } else {
                    // '~' is a single char, so stepping past it is ++ipat.
                    if (++ipat == pat.length()) {
                        return index; // success; c unparsed
                    }
                    // fall thru; process c again with next cpat
                }
            }

            // parse literal
            else if (c == cpat) {
                // c equals cpat, so both occupy the same number of chars.
                int n = UTF16.getCharCount(c);
                index += n;
                ipat += n;
                if (ipat == pat.length()) {
                    return index; // success; c parsed
                }
                // fall thru; get next cpat
            }

            // match failure of literal
            else {
                return -1;
            }

            cpat = UTF16.charAt(pat, ipat);
        }

        // Reached only when the loop ran out of text with pattern left over;
        // this includes a pattern whose remainder is just '~'.
        return -1; // text ended before end of pat
    }

    /**
     * Parse an integer at pos, either of the form \d+ or of the form
     * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
     * or octal format.
     * @param pos INPUT-OUTPUT parameter.  On input, the first
     * character to parse.  On output, the character after the last
     * parsed character.
1338 */ 1339 public static int parseInteger(String rule, int[] pos, int limit) { 1340 int count = 0; 1341 int value = 0; 1342 int p = pos[0]; 1343 int radix = 10; 1344 1345 if (rule.regionMatches(true, p, "0x", 0, 2)) { 1346 p += 2; 1347 radix = 16; 1348 } else if (p < limit && rule.charAt(p) == '0') { 1349 p++; 1350 count = 1; 1351 radix = 8; 1352 } 1353 1354 while (p < limit) { 1355 int d = UCharacter.digit(rule.charAt(p++), radix); 1356 if (d < 0) { 1357 --p; 1358 break; 1359 } 1360 ++count; 1361 int v = (value * radix) + d; 1362 if (v <= value) { 1363 // If there are too many input digits, at some point 1364 // the value will go negative, e.g., if we have seen 1365 // "0x8000000" already and there is another '0', when 1366 // we parse the next 0 the value will go negative. 1367 return 0; 1368 } 1369 value = v; 1370 } 1371 if (count > 0) { 1372 pos[0] = p; 1373 } 1374 return value; 1375 } 1376 1377 /** 1378 * Parse a Unicode identifier from the given string at the given 1379 * position. Return the identifier, or null if there is no 1380 * identifier. 1381 * @param str the string to parse 1382 * @param pos INPUT-OUTPUT parameter. On INPUT, pos[0] is the 1383 * first character to examine. It must be less than str.length(), 1384 * and it must not point to a whitespace character. That is, must 1385 * have pos[0] < str.length(). On 1386 * OUTPUT, the position after the last parsed character. 1387 * @return the Unicode identifier, or null if there is no valid 1388 * identifier at pos[0]. 
1389 */ 1390 public static String parseUnicodeIdentifier(String str, int[] pos) { 1391 // assert(pos[0] < str.length()); 1392 StringBuilder buf = new StringBuilder(); 1393 int p = pos[0]; 1394 while (p < str.length()) { 1395 int ch = Character.codePointAt(str, p); 1396 if (buf.length() == 0) { 1397 if (UCharacter.isUnicodeIdentifierStart(ch)) { 1398 buf.appendCodePoint(ch); 1399 } else { 1400 return null; 1401 } 1402 } else { 1403 if (UCharacter.isUnicodeIdentifierPart(ch)) { 1404 buf.appendCodePoint(ch); 1405 } else { 1406 break; 1407 } 1408 } 1409 p += UTF16.getCharCount(ch); 1410 } 1411 pos[0] = p; 1412 return buf.toString(); 1413 } 1414 1415 static final char DIGITS[] = { 1416 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 1417 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 1418 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 1419 'U', 'V', 'W', 'X', 'Y', 'Z' 1420 }; 1421 1422 /** 1423 * Append the digits of a positive integer to the given 1424 * <code>Appendable</code> in the given radix. This is 1425 * done recursively since it is easiest to generate the low- 1426 * order digit first, but it must be appended last. 1427 * 1428 * @param result is the <code>Appendable</code> to append to 1429 * @param n is the positive integer 1430 * @param radix is the radix, from 2 to 36 inclusive 1431 * @param minDigits is the minimum number of digits to append. 1432 */ 1433 private static <T extends Appendable> void recursiveAppendNumber(T result, int n, 1434 int radix, int minDigits) 1435 { 1436 try { 1437 int digit = n % radix; 1438 1439 if (n >= radix || minDigits > 1) { 1440 recursiveAppendNumber(result, n / radix, radix, minDigits - 1); 1441 } 1442 result.append(DIGITS[digit]); 1443 } catch (IOException e) { 1444 throw new IllegalIcuArgumentException(e); 1445 } 1446 } 1447 1448 /** 1449 * Append a number to the given Appendable in the given radix. 1450 * Standard digits '0'-'9' are used and letters 'A'-'Z' for 1451 * radices 11 through 36. 
1452 * @param result the digits of the number are appended here 1453 * @param n the number to be converted to digits; may be negative. 1454 * If negative, a '-' is prepended to the digits. 1455 * @param radix a radix from 2 to 36 inclusive. 1456 * @param minDigits the minimum number of digits, not including 1457 * any '-', to produce. Values less than 2 have no effect. One 1458 * digit is always emitted regardless of this parameter. 1459 * @return a reference to result 1460 */ 1461 public static <T extends Appendable> T appendNumber(T result, int n, 1462 int radix, int minDigits) 1463 { 1464 try { 1465 if (radix < 2 || radix > 36) { 1466 throw new IllegalArgumentException("Illegal radix " + radix); 1467 } 1468 1469 1470 int abs = n; 1471 1472 if (n < 0) { 1473 abs = -n; 1474 result.append("-"); 1475 } 1476 1477 recursiveAppendNumber(result, abs, radix, minDigits); 1478 1479 return result; 1480 } catch (IOException e) { 1481 throw new IllegalIcuArgumentException(e); 1482 } 1483 1484 } 1485 1486 /** 1487 * Parse an unsigned 31-bit integer at the given offset. Use 1488 * UCharacter.digit() to parse individual characters into digits. 1489 * @param text the text to be parsed 1490 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the 1491 * offset within text at which to start parsing; it should point 1492 * to a valid digit. On exit, pos[0] is the offset after the last 1493 * parsed character. If the parse failed, it will be unchanged on 1494 * exit. Must be >= 0 on entry. 1495 * @param radix the radix in which to parse; must be >= 2 and <= 1496 * 36. 1497 * @return a non-negative parsed number, or -1 upon parse failure. 1498 * Parse fails if there are no digits, that is, if pos[0] does not 1499 * point to a valid digit on entry, or if the number to be parsed 1500 * does not fit into a 31-bit unsigned integer. 
1501 */ 1502 public static int parseNumber(String text, int[] pos, int radix) { 1503 // assert(pos[0] >= 0); 1504 // assert(radix >= 2); 1505 // assert(radix <= 36); 1506 int n = 0; 1507 int p = pos[0]; 1508 while (p < text.length()) { 1509 int ch = Character.codePointAt(text, p); 1510 int d = UCharacter.digit(ch, radix); 1511 if (d < 0) { 1512 break; 1513 } 1514 n = radix*n + d; 1515 // ASSUME that when a 32-bit integer overflows it becomes 1516 // negative. E.g., 214748364 * 10 + 8 => negative value. 1517 if (n < 0) { 1518 return -1; 1519 } 1520 ++p; 1521 } 1522 if (p == pos[0]) { 1523 return -1; 1524 } 1525 pos[0] = p; 1526 return n; 1527 } 1528 1529 /** 1530 * Return true if the character is NOT printable ASCII. The tab, 1531 * newline and linefeed characters are considered unprintable. 1532 */ 1533 public static boolean isUnprintable(int c) { 1534 //0x20 = 32 and 0x7E = 126 1535 return !(c >= 0x20 && c <= 0x7E); 1536 } 1537 1538 /** 1539 * @return true for control codes and for surrogate and noncharacter code points 1540 */ 1541 public static boolean shouldAlwaysBeEscaped(int c) { 1542 if (c < 0x20) { 1543 return true; // C0 control codes 1544 } else if (c <= 0x7e) { 1545 return false; // printable ASCII 1546 } else if (c <= 0x9f) { 1547 return true; // C1 control codes 1548 } else if (c < 0xd800) { 1549 return false; // most of the BMP 1550 } else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) { 1551 return true; // surrogate or noncharacter code points 1552 } else if (c <= 0x10ffff) { 1553 return false; // all else 1554 } else { 1555 return true; // not a code point 1556 } 1557 } 1558 1559 /** 1560 * Escapes one unprintable code point using <backslash>uxxxx notation 1561 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and 1562 * above. If the character is printable ASCII, then do nothing 1563 * and return false. Otherwise, append the escaped notation and 1564 * return true. 
1565 */ 1566 public static <T extends Appendable> boolean escapeUnprintable(T result, int c) { 1567 if (isUnprintable(c)) { 1568 escape(result, c); 1569 return true; 1570 } 1571 return false; 1572 } 1573 1574 /** 1575 * Escapes one code point using <backslash>uxxxx notation 1576 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and above. 1577 * @return result 1578 */ 1579 public static <T extends Appendable> T escape(T result, int c) { 1580 try { 1581 result.append('\\'); 1582 if ((c & ~0xFFFF) != 0) { 1583 result.append('U'); 1584 result.append(DIGITS[0xF&(c>>28)]); 1585 result.append(DIGITS[0xF&(c>>24)]); 1586 result.append(DIGITS[0xF&(c>>20)]); 1587 result.append(DIGITS[0xF&(c>>16)]); 1588 } else { 1589 result.append('u'); 1590 } 1591 result.append(DIGITS[0xF&(c>>12)]); 1592 result.append(DIGITS[0xF&(c>>8)]); 1593 result.append(DIGITS[0xF&(c>>4)]); 1594 result.append(DIGITS[0xF&c]); 1595 return result; 1596 } catch (IOException e) { 1597 throw new ICUUncheckedIOException(e); 1598 } 1599 } 1600 1601 /** 1602 * Returns the index of the first character in a set, ignoring quoted text. 1603 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 1604 * found by a search for "h". Unlike String.indexOf(), this method searches 1605 * not for a single character, but for any character of the string 1606 * <code>setOfChars</code>. 1607 * @param text text to be searched 1608 * @param start the beginning index, inclusive; <code>0 <= start 1609 * <= limit</code>. 1610 * @param limit the ending index, exclusive; <code>start <= limit 1611 * <= text.length()</code>. 1612 * @param setOfChars string with one or more distinct characters 1613 * @return Offset of the first character in <code>setOfChars</code> 1614 * found, or -1 if not found. 
* @see String#indexOf
     */
    public static int quotedIndexOf(String text, int start, int limit,
            String setOfChars) {
        for (int i=start; i<limit; ++i) {
            char c = text.charAt(i);
            if (c == BACKSLASH) {
                // Skip the escaped character; it can never be a match.
                ++i;
            } else if (c == APOSTROPHE) {
                // Skip to the closing apostrophe (or to limit if there is none).
                while (++i < limit
                        && text.charAt(i) != APOSTROPHE) {}
            } else if (setOfChars.indexOf(c) >= 0) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Append a character to a rule that is being built up.  To flush
     * the quoteBuf to rule, make one final call with isLiteral == true.
     * If there is no final character, pass in (int)-1 as c.
     * @param rule the string to append the character to
     * @param c the character to append, or (int)-1 if none.
     * @param isLiteral if true, then the given character should not be
     * quoted or escaped.  Usually this means it is a syntactic element
     * such as > or $
     * @param escapeUnprintable if true, then unprintable characters
     * should be escaped using escapeUnprintable().  These escapes will
     * appear outside of quotes.
     * @param quoteBuf a buffer which is used to build up quoted
     * substrings.  The caller should initially supply an empty buffer,
     * and thereafter should not modify the buffer.  The buffer should be
     * cleared out by, at the end, calling this method with a literal
     * character (which may be -1).
     */
    public static void appendToRule(StringBuffer rule,
            int c,
            boolean isLiteral,
            boolean escapeUnprintable,
            StringBuffer quoteBuf) {
        // If we are escaping unprintables, then escape them outside
        // quotes.  \\u and \\U are not recognized within quotes.  The same
        // logic applies to literals, but literals are never escaped.
        if (isLiteral ||
                (escapeUnprintable && Utility.isUnprintable(c))) {
            // First flush any pending quoted text to the rule.
            if (quoteBuf.length() > 0) {
                // We prefer backslash APOSTROPHE to double APOSTROPHE
                // (more readable, less similar to ") so if there are
                // double APOSTROPHEs at the ends, we pull them outside
                // of the quote.

                // If the first thing in the quoteBuf is APOSTROPHE
                // (doubled) then pull it out.
                while (quoteBuf.length() >= 2 &&
                        quoteBuf.charAt(0) == APOSTROPHE &&
                        quoteBuf.charAt(1) == APOSTROPHE) {
                    rule.append(BACKSLASH).append(APOSTROPHE);
                    quoteBuf.delete(0, 2);
                }
                // If the last thing in the quoteBuf is APOSTROPHE
                // (doubled) then remove and count it and add it after.
                int trailingCount = 0;
                while (quoteBuf.length() >= 2 &&
                        quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
                        quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
                    quoteBuf.setLength(quoteBuf.length()-2);
                    ++trailingCount;
                }
                // Whatever is left in the buffer is emitted inside one pair of quotes.
                if (quoteBuf.length() > 0) {
                    rule.append(APOSTROPHE);
                    rule.append(quoteBuf);
                    rule.append(APOSTROPHE);
                    quoteBuf.setLength(0);
                }
                while (trailingCount-- > 0) {
                    rule.append(BACKSLASH).append(APOSTROPHE);
                }
            }
            if (c != -1) {
                /* Since spaces are ignored during parsing, they are
                 * emitted only for readability.  We emit one here
                 * only if there isn't already one at the end of the
                 * rule.  (No space is emitted into an empty rule either.)
                 */
                if (c == ' ') {
                    int len = rule.length();
                    if (len > 0 && rule.charAt(len-1) != ' ') {
                        rule.append(' ');
                    }
                } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
                    // Reached when escaping is off, or when c was printable and
                    // escapeUnprintable() therefore appended nothing.
                    rule.appendCodePoint(c);
                }
            }
        }

        // Escape ' and '\' and don't begin a quote just for them
        else if (quoteBuf.length() == 0 &&
                (c == APOSTROPHE || c == BACKSLASH)) {
            rule.append(BACKSLASH).append((char)c);
        }

        // Specials (printable ascii that isn't [0-9a-zA-Z]) and
        // whitespace need quoting.  Also append stuff to quotes if we are
        // building up a quoted substring already.
        else if (quoteBuf.length() > 0 ||
                (c >= 0x0021 && c <= 0x007E &&
                        !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
                                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
                                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
                                PatternProps.isWhiteSpace(c)) {
            quoteBuf.appendCodePoint(c);
            // Double ' within a quote
            if (c == APOSTROPHE) {
                quoteBuf.append((char)c);
            }
        }

        // Otherwise just append
        else {
            rule.appendCodePoint(c);
        }
    }

    /**
     * Append the given string to the rule.  Calls the single-character
     * version of appendToRule for each character.
     * @param rule the string to append to
     * @param text the characters to append, one 16-bit code unit at a time
     * @param isLiteral passed through for every character
     * @param escapeUnprintable passed through for every character
     * @param quoteBuf passed through for every character
     */
    public static void appendToRule(StringBuffer rule,
            String text,
            boolean isLiteral,
            boolean escapeUnprintable,
            StringBuffer quoteBuf) {
        for (int i=0; i<text.length(); ++i) {
            // Okay to process in 16-bit code units here
            appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
        }
    }

    /**
     * Given a matcher reference, which may be null, append its
     * pattern as a literal to the given rule.
1757 */ 1758 public static void appendToRule(StringBuffer rule, 1759 UnicodeMatcher matcher, 1760 boolean escapeUnprintable, 1761 StringBuffer quoteBuf) { 1762 if (matcher != null) { 1763 appendToRule(rule, matcher.toPattern(escapeUnprintable), 1764 true, escapeUnprintable, quoteBuf); 1765 } 1766 } 1767 1768 /** 1769 * Compares 2 unsigned integers 1770 * @param source 32 bit unsigned integer 1771 * @param target 32 bit unsigned integer 1772 * @return 0 if equals, 1 if source is greater than target and -1 1773 * otherwise 1774 */ 1775 public static final int compareUnsigned(int source, int target) 1776 { 1777 source += MAGIC_UNSIGNED; 1778 target += MAGIC_UNSIGNED; 1779 if (source < target) { 1780 return -1; 1781 } 1782 else if (source > target) { 1783 return 1; 1784 } 1785 return 0; 1786 } 1787 1788 /** 1789 * Find the highest bit in a positive integer. This is done 1790 * by doing a binary search through the bits. 1791 * 1792 * @param n is the integer 1793 * 1794 * @return the bit number of the highest bit, with 0 being 1795 * the low order bit, or -1 if <code>n</code> is not positive 1796 */ 1797 public static final byte highBit(int n) 1798 { 1799 if (n <= 0) { 1800 return -1; 1801 } 1802 1803 byte bit = 0; 1804 1805 if (n >= 1 << 16) { 1806 n >>= 16; 1807 bit += 16; 1808 } 1809 1810 if (n >= 1 << 8) { 1811 n >>= 8; 1812 bit += 8; 1813 } 1814 1815 if (n >= 1 << 4) { 1816 n >>= 4; 1817 bit += 4; 1818 } 1819 1820 if (n >= 1 << 2) { 1821 n >>= 2; 1822 bit += 2; 1823 } 1824 1825 if (n >= 1 << 1) { 1826 n >>= 1; 1827 bit += 1; 1828 } 1829 1830 return bit; 1831 } 1832 /** 1833 * Utility method to take a int[] containing codepoints and return 1834 * a string representation with code units. 
1835 */ valueOf(int[]source)1836 public static String valueOf(int[]source){ 1837 // TODO: Investigate why this method is not on UTF16 class 1838 StringBuilder result = new StringBuilder(source.length); 1839 for(int i=0; i<source.length; i++){ 1840 result.appendCodePoint(source[i]); 1841 } 1842 return result.toString(); 1843 } 1844 1845 1846 /** 1847 * Utility to duplicate a string count times 1848 * @param s String to be duplicated. 1849 * @param count Number of times to duplicate a string. 1850 */ repeat(String s, int count)1851 public static String repeat(String s, int count) { 1852 if (count <= 0) return ""; 1853 if (count == 1) return s; 1854 StringBuilder result = new StringBuilder(); 1855 for (int i = 0; i < count; ++i) { 1856 result.append(s); 1857 } 1858 return result.toString(); 1859 } 1860 splitString(String src, String target)1861 public static String[] splitString(String src, String target) { 1862 return src.split("\\Q" + target + "\\E"); 1863 } 1864 1865 /** 1866 * Split the string at runs of ascii whitespace characters. 1867 */ splitWhitespace(String src)1868 public static String[] splitWhitespace(String src) { 1869 return src.split("\\s+"); 1870 } 1871 1872 /** 1873 * Parse a list of hex numbers and return a string 1874 * @param string String of hex numbers. 1875 * @param minLength Minimal length. 1876 * @param separator Separator. 1877 * @return A string from hex numbers. 1878 */ fromHex(String string, int minLength, String separator)1879 public static String fromHex(String string, int minLength, String separator) { 1880 return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+")); 1881 } 1882 1883 /** 1884 * Parse a list of hex numbers and return a string 1885 * @param string String of hex numbers. 1886 * @param minLength Minimal length. 1887 * @param separator Separator. 1888 * @return A string from hex numbers. 
1889 */ fromHex(String string, int minLength, Pattern separator)1890 public static String fromHex(String string, int minLength, Pattern separator) { 1891 StringBuilder buffer = new StringBuilder(); 1892 String[] parts = separator.split(string); 1893 for (String part : parts) { 1894 if (part.length() < minLength) { 1895 throw new IllegalArgumentException("code point too short: " + part); 1896 } 1897 int cp = Integer.parseInt(part, 16); 1898 buffer.appendCodePoint(cp); 1899 } 1900 return buffer.toString(); 1901 } 1902 1903 /** 1904 * This implementation is equivalent to Java 8+ Math#addExact(int, int) 1905 * @param x the first value 1906 * @param y the second value 1907 * @return the result 1908 */ addExact(int x, int y)1909 public static int addExact(int x, int y) { 1910 int r = x + y; 1911 // HD 2-12 Overflow iff both arguments have the opposite sign of the result 1912 if (((x ^ r) & (y ^ r)) < 0) { 1913 throw new ArithmeticException("integer overflow"); 1914 } 1915 return r; 1916 } 1917 1918 /** 1919 * Returns whether the chars in the two CharSequences are equal. 
1920 */ charSequenceEquals(CharSequence a, CharSequence b)1921 public static boolean charSequenceEquals(CharSequence a, CharSequence b) { 1922 if (a == b) { 1923 return true; 1924 } 1925 if (a == null || b == null) { 1926 return false; 1927 } 1928 if (a.length() != b.length()) { 1929 return false; 1930 } 1931 for (int i = 0; i < a.length(); i++) { 1932 if (a.charAt(i) != b.charAt(i)) 1933 return false; 1934 } 1935 return true; 1936 } 1937 1938 /** 1939 * Returns a hash code for a CharSequence that is equivalent to calling 1940 * charSequence.toString().hashCode() 1941 */ charSequenceHashCode(CharSequence value)1942 public static int charSequenceHashCode(CharSequence value) { 1943 int hash = 0; 1944 for (int i = 0; i < value.length(); i++) { 1945 hash = hash * 31 + value.charAt(i); 1946 } 1947 return hash; 1948 } 1949 1950 /** 1951 * Appends a CharSequence to an Appendable, converting IOException to ICUUncheckedIOException. 1952 */ appendTo(CharSequence string, A appendable)1953 public static <A extends Appendable> A appendTo(CharSequence string, A appendable) { 1954 try { 1955 appendable.append(string); 1956 return appendable; 1957 } catch (IOException e) { 1958 throw new ICUUncheckedIOException(e); 1959 } 1960 } 1961 1962 /** 1963 * Java 8+ String#join(CharSequence, Iterable<? extends CharSequence>) compatible method for Java 7 env. 1964 * @param delimiter the delimiter that separates each element 1965 * @param elements the elements to join together. 1966 * @return a new String that is composed of the elements separated by the delimiter 1967 * @throws NullPointerException If delimiter or elements is null 1968 */ joinStrings(CharSequence delimiter, Iterable<? extends CharSequence> elements)1969 public static String joinStrings(CharSequence delimiter, Iterable<? 
extends CharSequence> elements) { 1970 if (delimiter == null || elements == null) { 1971 throw new NullPointerException("Delimiter or elements is null"); 1972 } 1973 StringBuilder buf = new StringBuilder(); 1974 Iterator<? extends CharSequence> itr = elements.iterator(); 1975 boolean isFirstElem = true; 1976 while (itr.hasNext()) { 1977 CharSequence element = itr.next(); 1978 if (element != null) { 1979 if (!isFirstElem) { 1980 buf.append(delimiter); 1981 } else { 1982 isFirstElem = false; 1983 } 1984 buf.append(element); 1985 } 1986 } 1987 return buf.toString(); 1988 } 1989 } 1990