1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2015, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import java.io.IOException; 12 import java.util.ArrayList; 13 import java.util.Iterator; 14 import java.util.Locale; 15 import java.util.regex.Pattern; 16 17 import com.ibm.icu.lang.UCharacter; 18 import com.ibm.icu.text.Replaceable; 19 import com.ibm.icu.text.UTF16; 20 import com.ibm.icu.text.UnicodeMatcher; 21 import com.ibm.icu.util.ICUUncheckedIOException; 22 23 public final class Utility { 24 25 private static final char APOSTROPHE = '\''; 26 private static final char BACKSLASH = '\\'; 27 private static final int MAGIC_UNSIGNED = 0x80000000; 28 29 /** 30 * Convenience utility to compare two Object[]s. 31 * Ought to be in System 32 */ arrayEquals(Object[] source, Object target)33 public final static boolean arrayEquals(Object[] source, Object target) { 34 if (source == null) return (target == null); 35 if (!(target instanceof Object[])) return false; 36 Object[] targ = (Object[]) target; 37 return (source.length == targ.length 38 && arrayRegionMatches(source, 0, targ, 0, source.length)); 39 } 40 41 /** 42 * Convenience utility to compare two int[]s 43 * Ought to be in System 44 */ arrayEquals(int[] source, Object target)45 public final static boolean arrayEquals(int[] source, Object target) { 46 if (source == null) return (target == null); 47 if (!(target instanceof int[])) return false; 48 int[] targ = (int[]) target; 49 return (source.length == targ.length 50 && arrayRegionMatches(source, 0, targ, 0, source.length)); 51 } 52 53 /** 54 * Convenience utility to compare two double[]s 55 * Ought to be in System 56 */ arrayEquals(double[] source, Object target)57 public final static boolean arrayEquals(double[] source, Object target) { 58 if (source == null) return (target == null); 59 if (!(target instanceof double[])) return false; 60 double[] targ = (double[]) target; 61 return (source.length == targ.length 62 && arrayRegionMatches(source, 0, targ, 0, source.length)); 63 } arrayEquals(byte[] source, Object target)64 public final static boolean arrayEquals(byte[] source, Object target) { 65 if (source == null) return (target == null); 66 if (!(target instanceof byte[])) return false; 67 byte[] targ = (byte[]) target; 68 return (source.length == targ.length 69 && arrayRegionMatches(source, 0, targ, 0, source.length)); 70 } 71 72 /** 73 * Convenience utility to compare two Object[]s 74 * Ought to be in System 75 */ arrayEquals(Object source, Object target)76 public final static boolean arrayEquals(Object source, Object target) { 77 if (source == null) return (target == null); 78 // for some reason, the correct arrayEquals is not being called 79 // so do it by hand for now. 80 if (source instanceof Object[]) 81 return(arrayEquals((Object[]) source,target)); 82 if (source instanceof int[]) 83 return(arrayEquals((int[]) source,target)); 84 if (source instanceof double[]) 85 return(arrayEquals((double[]) source, target)); 86 if (source instanceof byte[]) 87 return(arrayEquals((byte[]) source,target)); 88 return source.equals(target); 89 } 90 91 /** 92 * Convenience utility to compare two Object[]s 93 * Ought to be in System. 94 * @param len the length to compare. 95 * The start indices and start+len must be valid. 96 */ arrayRegionMatches(Object[] source, int sourceStart, Object[] target, int targetStart, int len)97 public final static boolean arrayRegionMatches(Object[] source, int sourceStart, 98 Object[] target, int targetStart, 99 int len) 100 { 101 int sourceEnd = sourceStart + len; 102 int delta = targetStart - sourceStart; 103 for (int i = sourceStart; i < sourceEnd; i++) { 104 if (!arrayEquals(source[i],target[i + delta])) 105 return false; 106 } 107 return true; 108 } 109 110 /** 111 * Convenience utility to compare two Object[]s 112 * Ought to be in System. 113 * @param len the length to compare. 114 * The start indices and start+len must be valid. 115 */ arrayRegionMatches(char[] source, int sourceStart, char[] target, int targetStart, int len)116 public final static boolean arrayRegionMatches(char[] source, int sourceStart, 117 char[] target, int targetStart, 118 int len) 119 { 120 int sourceEnd = sourceStart + len; 121 int delta = targetStart - sourceStart; 122 for (int i = sourceStart; i < sourceEnd; i++) { 123 if (source[i]!=target[i + delta]) 124 return false; 125 } 126 return true; 127 } 128 129 /** 130 * Convenience utility to compare two int[]s. 131 * @param len the length to compare. 132 * The start indices and start+len must be valid. 133 * Ought to be in System 134 */ arrayRegionMatches(int[] source, int sourceStart, int[] target, int targetStart, int len)135 public final static boolean arrayRegionMatches(int[] source, int sourceStart, 136 int[] target, int targetStart, 137 int len) 138 { 139 int sourceEnd = sourceStart + len; 140 int delta = targetStart - sourceStart; 141 for (int i = sourceStart; i < sourceEnd; i++) { 142 if (source[i] != target[i + delta]) 143 return false; 144 } 145 return true; 146 } 147 148 /** 149 * Convenience utility to compare two arrays of doubles. 150 * @param len the length to compare. 151 * The start indices and start+len must be valid. 152 * Ought to be in System 153 */ arrayRegionMatches(double[] source, int sourceStart, double[] target, int targetStart, int len)154 public final static boolean arrayRegionMatches(double[] source, int sourceStart, 155 double[] target, int targetStart, 156 int len) 157 { 158 int sourceEnd = sourceStart + len; 159 int delta = targetStart - sourceStart; 160 for (int i = sourceStart; i < sourceEnd; i++) { 161 if (source[i] != target[i + delta]) 162 return false; 163 } 164 return true; 165 } arrayRegionMatches(byte[] source, int sourceStart, byte[] target, int targetStart, int len)166 public final static boolean arrayRegionMatches(byte[] source, int sourceStart, 167 byte[] target, int targetStart, int len){ 168 int sourceEnd = sourceStart + len; 169 int delta = targetStart - sourceStart; 170 for (int i = sourceStart; i < sourceEnd; i++) { 171 if (source[i] != target[i + delta]) 172 return false; 173 } 174 return true; 175 } 176 177 /** 178 * Trivial reference equality. 179 * This method should help document that we really want == not equals(), 180 * and to have a single place to suppress warnings from static analysis tools. 181 */ sameObjects(Object a, Object b)182 public static final boolean sameObjects(Object a, Object b) { 183 return a == b; 184 } 185 186 /** 187 * Convenience utility. Does null checks on objects, then calls compare. 188 */ checkCompare(T a, T b)189 public static <T extends Comparable<T>> int checkCompare(T a, T b) { 190 return a == null ? 191 b == null ? 0 : -1 : 192 b == null ? 1 : a.compareTo(b); 193 } 194 195 /** 196 * Convenience utility. Does null checks on object, then calls hashCode. 197 */ checkHash(Object a)198 public static int checkHash(Object a) { 199 return a == null ? 0 : a.hashCode(); 200 } 201 202 /** 203 * The ESCAPE character is used during run-length encoding. It signals 204 * a run of identical chars. 205 */ 206 private static final char ESCAPE = '\uA5A5'; 207 208 /** 209 * The ESCAPE_BYTE character is used during run-length encoding. It signals 210 * a run of identical bytes. 211 */ 212 static final byte ESCAPE_BYTE = (byte)0xA5; 213 214 /** 215 * Construct a string representing an int array. Use run-length encoding. 216 * A character represents itself, unless it is the ESCAPE character. Then 217 * the following notations are possible: 218 * ESCAPE ESCAPE ESCAPE literal 219 * ESCAPE n c n instances of character c 220 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 221 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 222 * If we encounter a run where n == ESCAPE, we represent this as: 223 * c ESCAPE n-1 c 224 * The ESCAPE value is chosen so as not to collide with commonly 225 * seen values. 226 */ arrayToRLEString(int[] a)227 static public final String arrayToRLEString(int[] a) { 228 StringBuilder buffer = new StringBuilder(); 229 230 appendInt(buffer, a.length); 231 int runValue = a[0]; 232 int runLength = 1; 233 for (int i=1; i<a.length; ++i) { 234 int s = a[i]; 235 if (s == runValue && runLength < 0xFFFF) { 236 ++runLength; 237 } else { 238 encodeRun(buffer, runValue, runLength); 239 runValue = s; 240 runLength = 1; 241 } 242 } 243 encodeRun(buffer, runValue, runLength); 244 return buffer.toString(); 245 } 246 247 /** 248 * Construct a string representing a short array. Use run-length encoding. 249 * A character represents itself, unless it is the ESCAPE character. Then 250 * the following notations are possible: 251 * ESCAPE ESCAPE ESCAPE literal 252 * ESCAPE n c n instances of character c 253 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 254 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 255 * If we encounter a run where n == ESCAPE, we represent this as: 256 * c ESCAPE n-1 c 257 * The ESCAPE value is chosen so as not to collide with commonly 258 * seen values. 259 */ arrayToRLEString(short[] a)260 static public final String arrayToRLEString(short[] a) { 261 StringBuilder buffer = new StringBuilder(); 262 // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]); 263 buffer.append((char) (a.length >> 16)); 264 buffer.append((char) a.length); 265 short runValue = a[0]; 266 int runLength = 1; 267 for (int i=1; i<a.length; ++i) { 268 short s = a[i]; 269 if (s == runValue && runLength < 0xFFFF) ++runLength; 270 else { 271 encodeRun(buffer, runValue, runLength); 272 runValue = s; 273 runLength = 1; 274 } 275 } 276 encodeRun(buffer, runValue, runLength); 277 return buffer.toString(); 278 } 279 280 /** 281 * Construct a string representing a char array. Use run-length encoding. 282 * A character represents itself, unless it is the ESCAPE character. Then 283 * the following notations are possible: 284 * ESCAPE ESCAPE ESCAPE literal 285 * ESCAPE n c n instances of character c 286 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 287 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 288 * If we encounter a run where n == ESCAPE, we represent this as: 289 * c ESCAPE n-1 c 290 * The ESCAPE value is chosen so as not to collide with commonly 291 * seen values. 292 */ arrayToRLEString(char[] a)293 static public final String arrayToRLEString(char[] a) { 294 StringBuilder buffer = new StringBuilder(); 295 buffer.append((char) (a.length >> 16)); 296 buffer.append((char) a.length); 297 char runValue = a[0]; 298 int runLength = 1; 299 for (int i=1; i<a.length; ++i) { 300 char s = a[i]; 301 if (s == runValue && runLength < 0xFFFF) ++runLength; 302 else { 303 encodeRun(buffer, (short)runValue, runLength); 304 runValue = s; 305 runLength = 1; 306 } 307 } 308 encodeRun(buffer, (short)runValue, runLength); 309 return buffer.toString(); 310 } 311 312 /** 313 * Construct a string representing a byte array. Use run-length encoding. 314 * Two bytes are packed into a single char, with a single extra zero byte at 315 * the end if needed. A byte represents itself, unless it is the 316 * ESCAPE_BYTE. Then the following notations are possible: 317 * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal 318 * ESCAPE_BYTE n b n instances of byte b 319 * Since an encoded run occupies 3 bytes, we only encode runs of 4 or 320 * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. 321 * If we encounter a run where n == ESCAPE_BYTE, we represent this as: 322 * b ESCAPE_BYTE n-1 b 323 * The ESCAPE_BYTE value is chosen so as not to collide with commonly 324 * seen values. 325 */ arrayToRLEString(byte[] a)326 static public final String arrayToRLEString(byte[] a) { 327 StringBuilder buffer = new StringBuilder(); 328 buffer.append((char) (a.length >> 16)); 329 buffer.append((char) a.length); 330 byte runValue = a[0]; 331 int runLength = 1; 332 byte[] state = new byte[2]; 333 for (int i=1; i<a.length; ++i) { 334 byte b = a[i]; 335 if (b == runValue && runLength < 0xFF) ++runLength; 336 else { 337 encodeRun(buffer, runValue, runLength, state); 338 runValue = b; 339 runLength = 1; 340 } 341 } 342 encodeRun(buffer, runValue, runLength, state); 343 344 // We must save the final byte, if there is one, by padding 345 // an extra zero. 346 if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state); 347 348 return buffer.toString(); 349 } 350 351 /** 352 * Encode a run, possibly a degenerate run (of < 4 values). 353 * @param length The length of the run; must be > 0 && <= 0xFFFF. 354 */ encodeRun(T buffer, int value, int length)355 private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) { 356 if (length < 4) { 357 for (int j=0; j<length; ++j) { 358 if (value == ESCAPE) { 359 appendInt(buffer, value); 360 } 361 appendInt(buffer, value); 362 } 363 } 364 else { 365 if (length == ESCAPE) { 366 if (value == ESCAPE) { 367 appendInt(buffer, ESCAPE); 368 } 369 appendInt(buffer, value); 370 --length; 371 } 372 appendInt(buffer, ESCAPE); 373 appendInt(buffer, length); 374 appendInt(buffer, value); // Don't need to escape this value 375 } 376 } 377 appendInt(T buffer, int value)378 private static final <T extends Appendable> void appendInt(T buffer, int value) { 379 try { 380 buffer.append((char)(value >>> 16)); 381 buffer.append((char)(value & 0xFFFF)); 382 } catch (IOException e) { 383 throw new IllegalIcuArgumentException(e); 384 } 385 } 386 387 /** 388 * Encode a run, possibly a degenerate run (of < 4 values). 389 * @param length The length of the run; must be > 0 && <= 0xFFFF. 390 */ encodeRun(T buffer, short value, int length)391 private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) { 392 try { 393 char valueChar = (char) value; 394 if (length < 4) { 395 for (int j=0; j<length; ++j) { 396 if (valueChar == ESCAPE) { 397 buffer.append(ESCAPE); 398 } 399 buffer.append(valueChar); 400 } 401 } 402 else { 403 if (length == ESCAPE) { 404 if (valueChar == ESCAPE) { 405 buffer.append(ESCAPE); 406 } 407 buffer.append(valueChar); 408 --length; 409 } 410 buffer.append(ESCAPE); 411 buffer.append((char) length); 412 buffer.append(valueChar); // Don't need to escape this value 413 } 414 } catch (IOException e) { 415 throw new IllegalIcuArgumentException(e); 416 } 417 } 418 419 /** 420 * Encode a run, possibly a degenerate run (of < 4 values). 421 * @param length The length of the run; must be > 0 && <= 0xFF. 422 */ encodeRun(T buffer, byte value, int length, byte[] state)423 private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length, 424 byte[] state) { 425 if (length < 4) { 426 for (int j=0; j<length; ++j) { 427 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 428 appendEncodedByte(buffer, value, state); 429 } 430 } 431 else { 432 if ((byte)length == ESCAPE_BYTE) { 433 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 434 appendEncodedByte(buffer, value, state); 435 --length; 436 } 437 appendEncodedByte(buffer, ESCAPE_BYTE, state); 438 appendEncodedByte(buffer, (byte)length, state); 439 appendEncodedByte(buffer, value, state); // Don't need to escape this value 440 } 441 } 442 443 /** 444 * Append a byte to the given Appendable, packing two bytes into each 445 * character. The state parameter maintains intermediary data between 446 * calls. 447 * @param state A two-element array, with state[0] == 0 if this is the 448 * first byte of a pair, or state[0] != 0 if this is the second byte 449 * of a pair, in which case state[1] is the first byte. 450 */ appendEncodedByte(T buffer, byte value, byte[] state)451 private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value, 452 byte[] state) { 453 try { 454 if (state[0] != 0) { 455 char c = (char) ((state[1] << 8) | ((value) & 0xFF)); 456 buffer.append(c); 457 state[0] = 0; 458 } 459 else { 460 state[0] = 1; 461 state[1] = value; 462 } 463 } catch (IOException e) { 464 throw new IllegalIcuArgumentException(e); 465 } 466 } 467 468 /** 469 * Construct an array of ints from a run-length encoded string. 470 */ RLEStringToIntArray(String s)471 static public final int[] RLEStringToIntArray(String s) { 472 int length = getInt(s, 0); 473 int[] array = new int[length]; 474 int ai = 0, i = 1; 475 476 int maxI = s.length() / 2; 477 while (ai < length && i < maxI) { 478 int c = getInt(s, i++); 479 480 if (c == ESCAPE) { 481 c = getInt(s, i++); 482 if (c == ESCAPE) { 483 array[ai++] = c; 484 } else { 485 int runLength = c; 486 int runValue = getInt(s, i++); 487 for (int j=0; j<runLength; ++j) { 488 array[ai++] = runValue; 489 } 490 } 491 } 492 else { 493 array[ai++] = c; 494 } 495 } 496 497 if (ai != length || i != maxI) { 498 throw new IllegalStateException("Bad run-length encoded int array"); 499 } 500 501 return array; 502 } getInt(String s, int i)503 static final int getInt(String s, int i) { 504 return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1); 505 } 506 507 /** 508 * Construct an array of shorts from a run-length encoded string. 509 */ RLEStringToShortArray(String s)510 static public final short[] RLEStringToShortArray(String s) { 511 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 512 short[] array = new short[length]; 513 int ai = 0; 514 for (int i=2; i<s.length(); ++i) { 515 char c = s.charAt(i); 516 if (c == ESCAPE) { 517 c = s.charAt(++i); 518 if (c == ESCAPE) { 519 array[ai++] = (short) c; 520 } else { 521 int runLength = c; 522 short runValue = (short) s.charAt(++i); 523 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 524 } 525 } 526 else { 527 array[ai++] = (short) c; 528 } 529 } 530 531 if (ai != length) 532 throw new IllegalStateException("Bad run-length encoded short array"); 533 534 return array; 535 } 536 537 /** 538 * Construct an array of shorts from a run-length encoded string. 539 */ RLEStringToCharArray(String s)540 static public final char[] RLEStringToCharArray(String s) { 541 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 542 char[] array = new char[length]; 543 int ai = 0; 544 for (int i=2; i<s.length(); ++i) { 545 char c = s.charAt(i); 546 if (c == ESCAPE) { 547 c = s.charAt(++i); 548 if (c == ESCAPE) { 549 array[ai++] = c; 550 } else { 551 int runLength = c; 552 char runValue = s.charAt(++i); 553 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 554 } 555 } 556 else { 557 array[ai++] = c; 558 } 559 } 560 561 if (ai != length) 562 throw new IllegalStateException("Bad run-length encoded short array"); 563 564 return array; 565 } 566 567 /** 568 * Construct an array of bytes from a run-length encoded string. 569 */ RLEStringToByteArray(String s)570 static public final byte[] RLEStringToByteArray(String s) { 571 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 572 byte[] array = new byte[length]; 573 boolean nextChar = true; 574 char c = 0; 575 int node = 0; 576 int runLength = 0; 577 int i = 2; 578 for (int ai=0; ai<length; ) { 579 // This part of the loop places the next byte into the local 580 // variable 'b' each time through the loop. It keeps the 581 // current character in 'c' and uses the boolean 'nextChar' 582 // to see if we've taken both bytes out of 'c' yet. 583 byte b; 584 if (nextChar) { 585 c = s.charAt(i++); 586 b = (byte) (c >> 8); 587 nextChar = false; 588 } 589 else { 590 b = (byte) (c & 0xFF); 591 nextChar = true; 592 } 593 594 // This part of the loop is a tiny state machine which handles 595 // the parsing of the run-length encoding. This would be simpler 596 // if we could look ahead, but we can't, so we use 'node' to 597 // move between three nodes in the state machine. 598 switch (node) { 599 case 0: 600 // Normal idle node 601 if (b == ESCAPE_BYTE) { 602 node = 1; 603 } 604 else { 605 array[ai++] = b; 606 } 607 break; 608 case 1: 609 // We have seen one ESCAPE_BYTE; we expect either a second 610 // one, or a run length and value. 611 if (b == ESCAPE_BYTE) { 612 array[ai++] = ESCAPE_BYTE; 613 node = 0; 614 } 615 else { 616 runLength = b; 617 // Interpret signed byte as unsigned 618 if (runLength < 0) runLength += 0x100; 619 node = 2; 620 } 621 break; 622 case 2: 623 // We have seen an ESCAPE_BYTE and length byte. We interpret 624 // the next byte as the value to be repeated. 625 for (int j=0; j<runLength; ++j) array[ai++] = b; 626 node = 0; 627 break; 628 } 629 } 630 631 if (node != 0) 632 throw new IllegalStateException("Bad run-length encoded byte array"); 633 634 if (i != s.length()) 635 throw new IllegalStateException("Excess data in RLE byte array string"); 636 637 return array; 638 } 639 640 static public String LINE_SEPARATOR = System.getProperty("line.separator"); 641 642 /** 643 * Format a String for representation in a source file. This includes 644 * breaking it into lines and escaping characters using octal notation 645 * when necessary (control characters and double quotes). 646 */ formatForSource(String s)647 static public final String formatForSource(String s) { 648 StringBuilder buffer = new StringBuilder(); 649 for (int i=0; i<s.length();) { 650 if (i > 0) buffer.append('+').append(LINE_SEPARATOR); 651 buffer.append(" \""); 652 int count = 11; 653 while (i<s.length() && count<80) { 654 char c = s.charAt(i++); 655 if (c < '\u0020' || c == '"' || c == '\\') { 656 if (c == '\n') { 657 buffer.append("\\n"); 658 count += 2; 659 } else if (c == '\t') { 660 buffer.append("\\t"); 661 count += 2; 662 } else if (c == '\r') { 663 buffer.append("\\r"); 664 count += 2; 665 } else { 666 // Represent control characters, backslash and double quote 667 // using octal notation; otherwise the string we form 668 // won't compile, since Unicode escape sequences are 669 // processed before tokenization. 670 buffer.append('\\'); 671 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 672 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 673 buffer.append(HEX_DIGIT[(c & 0007)]); 674 count += 4; 675 } 676 } 677 else if (c <= '\u007E') { 678 buffer.append(c); 679 count += 1; 680 } 681 else { 682 buffer.append("\\u"); 683 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 684 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 685 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 686 buffer.append(HEX_DIGIT[(c & 0x000F)]); 687 count += 6; 688 } 689 } 690 buffer.append('"'); 691 } 692 return buffer.toString(); 693 } 694 695 static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7', 696 '8','9','A','B','C','D','E','F'}; 697 698 /** 699 * Format a String for representation in a source file. Like 700 * formatForSource but does not do line breaking. 701 */ format1ForSource(String s)702 static public final String format1ForSource(String s) { 703 StringBuilder buffer = new StringBuilder(); 704 buffer.append("\""); 705 for (int i=0; i<s.length();) { 706 char c = s.charAt(i++); 707 if (c < '\u0020' || c == '"' || c == '\\') { 708 if (c == '\n') { 709 buffer.append("\\n"); 710 } else if (c == '\t') { 711 buffer.append("\\t"); 712 } else if (c == '\r') { 713 buffer.append("\\r"); 714 } else { 715 // Represent control characters, backslash and double quote 716 // using octal notation; otherwise the string we form 717 // won't compile, since Unicode escape sequences are 718 // processed before tokenization. 719 buffer.append('\\'); 720 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 721 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 722 buffer.append(HEX_DIGIT[(c & 0007)]); 723 } 724 } 725 else if (c <= '\u007E') { 726 buffer.append(c); 727 } 728 else { 729 buffer.append("\\u"); 730 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 731 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 732 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 733 buffer.append(HEX_DIGIT[(c & 0x000F)]); 734 } 735 } 736 buffer.append('"'); 737 return buffer.toString(); 738 } 739 740 /** 741 * Convert characters outside the range U+0020 to U+007F to 742 * Unicode escapes, and convert backslash to a double backslash. 743 */ escape(String s)744 public static final String escape(String s) { 745 StringBuilder buf = new StringBuilder(); 746 for (int i=0; i<s.length(); ) { 747 int c = Character.codePointAt(s, i); 748 i += UTF16.getCharCount(c); 749 if (c >= ' ' && c <= 0x007F) { 750 if (c == '\\') { 751 buf.append("\\\\"); // That is, "\\" 752 } else { 753 buf.append((char)c); 754 } 755 } else { 756 boolean four = c <= 0xFFFF; 757 buf.append(four ? "\\u" : "\\U"); 758 buf.append(hex(c, four ? 4 : 8)); 759 } 760 } 761 return buf.toString(); 762 } 763 764 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 765 static private final char[] UNESCAPE_MAP = { 766 /*" 0x22, 0x22 */ 767 /*' 0x27, 0x27 */ 768 /*? 0x3F, 0x3F */ 769 /*\ 0x5C, 0x5C */ 770 /*a*/ 0x61, 0x07, 771 /*b*/ 0x62, 0x08, 772 /*e*/ 0x65, 0x1b, 773 /*f*/ 0x66, 0x0c, 774 /*n*/ 0x6E, 0x0a, 775 /*r*/ 0x72, 0x0d, 776 /*t*/ 0x74, 0x09, 777 /*v*/ 0x76, 0x0b 778 }; 779 780 /** 781 * Convert an escape to a 32-bit code point value. We attempt 782 * to parallel the icu4c unescapeAt() function. 783 * @param offset16 an array containing offset to the character 784 * <em>after</em> the backslash. Upon return offset16[0] will 785 * be updated to point after the escape sequence. 786 * @return character value from 0 to 10FFFF, or -1 on error. 787 */ unescapeAt(String s, int[] offset16)788 public static int unescapeAt(String s, int[] offset16) { 789 int c; 790 int result = 0; 791 int n = 0; 792 int minDig = 0; 793 int maxDig = 0; 794 int bitsPerDigit = 4; 795 int dig; 796 int i; 797 boolean braces = false; 798 799 /* Check that offset is in range */ 800 int offset = offset16[0]; 801 int length = s.length(); 802 if (offset < 0 || offset >= length) { 803 return -1; 804 } 805 806 /* Fetch first UChar after '\\' */ 807 c = Character.codePointAt(s, offset); 808 offset += UTF16.getCharCount(c); 809 810 /* Convert hexadecimal and octal escapes */ 811 switch (c) { 812 case 'u': 813 minDig = maxDig = 4; 814 break; 815 case 'U': 816 minDig = maxDig = 8; 817 break; 818 case 'x': 819 minDig = 1; 820 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { 821 ++offset; 822 braces = true; 823 maxDig = 8; 824 } else { 825 maxDig = 2; 826 } 827 break; 828 default: 829 dig = UCharacter.digit(c, 8); 830 if (dig >= 0) { 831 minDig = 1; 832 maxDig = 3; 833 n = 1; /* Already have first octal digit */ 834 bitsPerDigit = 3; 835 result = dig; 836 } 837 break; 838 } 839 if (minDig != 0) { 840 while (offset < length && n < maxDig) { 841 c = UTF16.charAt(s, offset); 842 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); 843 if (dig < 0) { 844 break; 845 } 846 result = (result << bitsPerDigit) | dig; 847 offset += UTF16.getCharCount(c); 848 ++n; 849 } 850 if (n < minDig) { 851 return -1; 852 } 853 if (braces) { 854 if (c != 0x7D /*}*/) { 855 return -1; 856 } 857 ++offset; 858 } 859 if (result < 0 || result >= 0x110000) { 860 return -1; 861 } 862 // If an escape sequence specifies a lead surrogate, see 863 // if there is a trail surrogate after it, either as an 864 // escape or as a literal. If so, join them up into a 865 // supplementary. 866 if (offset < length && 867 UTF16.isLeadSurrogate((char) result)) { 868 int ahead = offset+1; 869 c = s.charAt(offset); // [sic] get 16-bit code unit 870 if (c == '\\' && ahead < length) { 871 int o[] = new int[] { ahead }; 872 c = unescapeAt(s, o); 873 ahead = o[0]; 874 } 875 if (UTF16.isTrailSurrogate((char) c)) { 876 offset = ahead; 877 result = Character.toCodePoint((char) result, (char) c); 878 } 879 } 880 offset16[0] = offset; 881 return result; 882 } 883 884 /* Convert C-style escapes in table */ 885 for (i=0; i<UNESCAPE_MAP.length; i+=2) { 886 if (c == UNESCAPE_MAP[i]) { 887 offset16[0] = offset; 888 return UNESCAPE_MAP[i+1]; 889 } else if (c < UNESCAPE_MAP[i]) { 890 break; 891 } 892 } 893 894 /* Map \cX to control-X: X & 0x1F */ 895 if (c == 'c' && offset < length) { 896 c = UTF16.charAt(s, offset); 897 offset16[0] = offset + UTF16.getCharCount(c); 898 return 0x1F & c; 899 } 900 901 /* If no special forms are recognized, then consider 902 * the backslash to generically escape the next character. */ 903 offset16[0] = offset; 904 return c; 905 } 906 907 /** 908 * Convert all escapes in a given string using unescapeAt(). 909 * @exception IllegalArgumentException if an invalid escape is 910 * seen. 911 */ unescape(String s)912 public static String unescape(String s) { 913 StringBuilder buf = new StringBuilder(); 914 int[] pos = new int[1]; 915 for (int i=0; i<s.length(); ) { 916 char c = s.charAt(i++); 917 if (c == '\\') { 918 pos[0] = i; 919 int e = unescapeAt(s, pos); 920 if (e < 0) { 921 throw new IllegalArgumentException("Invalid escape sequence " + 922 s.substring(i-1, Math.min(i+8, s.length()))); 923 } 924 buf.appendCodePoint(e); 925 i = pos[0]; 926 } else { 927 buf.append(c); 928 } 929 } 930 return buf.toString(); 931 } 932 933 /** 934 * Convert all escapes in a given string using unescapeAt(). 935 * Leave invalid escape sequences unchanged. 936 */ unescapeLeniently(String s)937 public static String unescapeLeniently(String s) { 938 StringBuilder buf = new StringBuilder(); 939 int[] pos = new int[1]; 940 for (int i=0; i<s.length(); ) { 941 char c = s.charAt(i++); 942 if (c == '\\') { 943 pos[0] = i; 944 int e = unescapeAt(s, pos); 945 if (e < 0) { 946 buf.append(c); 947 } else { 948 buf.appendCodePoint(e); 949 i = pos[0]; 950 } 951 } else { 952 buf.append(c); 953 } 954 } 955 return buf.toString(); 956 } 957 958 /** 959 * Convert a char to 4 hex uppercase digits. E.g., hex('a') => 960 * "0041". 961 */ hex(long ch)962 public static String hex(long ch) { 963 return hex(ch, 4); 964 } 965 966 /** 967 * Supplies a zero-padded hex representation of an integer (without 0x) 968 */ hex(long i, int places)969 static public String hex(long i, int places) { 970 if (i == Long.MIN_VALUE) return "-8000000000000000"; 971 boolean negative = i < 0; 972 if (negative) { 973 i = -i; 974 } 975 String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); 976 if (result.length() < places) { 977 result = "0000000000000000".substring(result.length(),places) + result; 978 } 979 if (negative) { 980 return '-' + result; 981 } 982 return result; 983 } 984 985 /** 986 * Convert a string to comma-separated groups of 4 hex uppercase 987 * digits. E.g., hex('ab') => "0041,0042". 988 */ 989 public static String hex(CharSequence s) { 990 return hex(s, 4, ",", true, new StringBuilder()).toString(); 991 } 992 993 /** 994 * Convert a string to separated groups of hex uppercase 995 * digits. E.g., hex('ab'...) => "0041,0042". Append the output 996 * to the given Appendable. 997 */ 998 public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) { 999 try { 1000 if (useCodePoints) { 1001 int cp; 1002 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1003 cp = Character.codePointAt(s, i); 1004 if (i != 0) { 1005 result.append(separator); 1006 } 1007 result.append(hex(cp,width)); 1008 } 1009 } else { 1010 for (int i = 0; i < s.length(); ++i) { 1011 if (i != 0) { 1012 result.append(separator); 1013 } 1014 result.append(hex(s.charAt(i),width)); 1015 } 1016 } 1017 return result; 1018 } catch (IOException e) { 1019 throw new IllegalIcuArgumentException(e); 1020 } 1021 } 1022 1023 public static String hex(byte[] o, int start, int end, String separator) { 1024 StringBuilder result = new StringBuilder(); 1025 //int ch; 1026 for (int i = start; i < end; ++i) { 1027 if (i != 0) result.append(separator); 1028 result.append(hex(o[i])); 1029 } 1030 return result.toString(); 1031 } 1032 1033 /** 1034 * Convert a string to comma-separated groups of 4 hex uppercase 1035 * digits. E.g., hex('ab') => "0041,0042". 1036 */ 1037 public static <S extends CharSequence> String hex(S s, int width, S separator) { 1038 return hex(s, width, separator, true, new StringBuilder()).toString(); 1039 } 1040 1041 /** 1042 * Split a string into pieces based on the given divider character 1043 * @param s the string to split 1044 * @param divider the character on which to split. Occurrences of 1045 * this character are not included in the output 1046 * @param output an array to receive the substrings between 1047 * instances of divider. It must be large enough on entry to 1048 * accommodate all output. Adjacent instances of the divider 1049 * character will place empty strings into output. Before 1050 * returning, output is padded out with empty strings. 1051 */ 1052 public static void split(String s, char divider, String[] output) { 1053 int last = 0; 1054 int current = 0; 1055 int i; 1056 for (i = 0; i < s.length(); ++i) { 1057 if (s.charAt(i) == divider) { 1058 output[current++] = s.substring(last,i); 1059 last = i+1; 1060 } 1061 } 1062 output[current++] = s.substring(last,i); 1063 while (current < output.length) { 1064 output[current++] = ""; 1065 } 1066 } 1067 1068 /** 1069 * Split a string into pieces based on the given divider character 1070 * @param s the string to split 1071 * @param divider the character on which to split. Occurrences of 1072 * this character are not included in the output 1073 * @return output an array to receive the substrings between 1074 * instances of divider. Adjacent instances of the divider 1075 * character will place empty strings into output. 1076 */ 1077 public static String[] split(String s, char divider) { 1078 int last = 0; 1079 int i; 1080 ArrayList<String> output = new ArrayList<>(); 1081 for (i = 0; i < s.length(); ++i) { 1082 if (s.charAt(i) == divider) { 1083 output.add(s.substring(last,i)); 1084 last = i+1; 1085 } 1086 } 1087 output.add( s.substring(last,i)); 1088 return output.toArray(new String[output.size()]); 1089 } 1090 1091 /** 1092 * Look up a given string in a string array. Returns the index at 1093 * which the first occurrence of the string was found in the 1094 * array, or -1 if it was not found. 1095 * @param source the string to search for 1096 * @param target the array of zero or more strings in which to 1097 * look for source 1098 * @return the index of target at which source first occurs, or -1 1099 * if not found 1100 */ 1101 public static int lookup(String source, String[] target) { 1102 for (int i = 0; i < target.length; ++i) { 1103 if (source.equals(target[i])) return i; 1104 } 1105 return -1; 1106 } 1107 1108 /** 1109 * Parse a single non-whitespace character 'ch', optionally 1110 * preceded by whitespace. 1111 * @param id the string to be parsed 1112 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 1113 * offset of the first character to be parsed. On output, pos[0] 1114 * is the index after the last parsed character. If the parse 1115 * fails, pos[0] will be unchanged. 1116 * @param ch the non-whitespace character to be parsed. 1117 * @return true if 'ch' is seen preceded by zero or more 1118 * whitespace characters. 1119 */ 1120 public static boolean parseChar(String id, int[] pos, char ch) { 1121 int start = pos[0]; 1122 pos[0] = PatternProps.skipWhiteSpace(id, pos[0]); 1123 if (pos[0] == id.length() || 1124 id.charAt(pos[0]) != ch) { 1125 pos[0] = start; 1126 return false; 1127 } 1128 ++pos[0]; 1129 return true; 1130 } 1131 1132 /** 1133 * Parse a pattern string starting at offset pos. Keywords are 1134 * matched case-insensitively. Spaces may be skipped and may be 1135 * optional or required. Integer values may be parsed, and if 1136 * they are, they will be returned in the given array. If 1137 * successful, the offset of the next non-space character is 1138 * returned. On failure, -1 is returned. 1139 * @param pattern must only contain lowercase characters, which 1140 * will match their uppercase equivalents as well. A space 1141 * character matches one or more required spaces. A '~' character 1142 * matches zero or more optional spaces. A '#' character matches 1143 * an integer and stores it in parsedInts, which the caller must 1144 * ensure has enough capacity. 1145 * @param parsedInts array to receive parsed integers. Caller 1146 * must ensure that parsedInts.length is >= the number of '#' 1147 * signs in 'pattern'. 1148 * @return the position after the last character parsed, or -1 if 1149 * the parse failed 1150 */ 1151 @SuppressWarnings("fallthrough") 1152 public static int parsePattern(String rule, int pos, int limit, 1153 String pattern, int[] parsedInts) { 1154 // TODO Update this to handle surrogates 1155 int[] p = new int[1]; 1156 int intCount = 0; // number of integers parsed 1157 for (int i=0; i<pattern.length(); ++i) { 1158 char cpat = pattern.charAt(i); 1159 char c; 1160 switch (cpat) { 1161 case ' ': 1162 if (pos >= limit) { 1163 return -1; 1164 } 1165 c = rule.charAt(pos++); 1166 if (!PatternProps.isWhiteSpace(c)) { 1167 return -1; 1168 } 1169 // FALL THROUGH to skipWhitespace 1170 case '~': 1171 pos = PatternProps.skipWhiteSpace(rule, pos); 1172 break; 1173 case '#': 1174 p[0] = pos; 1175 parsedInts[intCount++] = parseInteger(rule, p, limit); 1176 if (p[0] == pos) { 1177 // Syntax error; failed to parse integer 1178 return -1; 1179 } 1180 pos = p[0]; 1181 break; 1182 default: 1183 if (pos >= limit) { 1184 return -1; 1185 } 1186 c = (char) UCharacter.toLowerCase(rule.charAt(pos++)); 1187 if (c != cpat) { 1188 return -1; 1189 } 1190 break; 1191 } 1192 } 1193 return pos; 1194 } 1195 1196 /** 1197 * Parse a pattern string within the given Replaceable and a parsing 1198 * pattern. Characters are matched literally and case-sensitively 1199 * except for the following special characters: 1200 * 1201 * ~ zero or more Pattern_White_Space chars 1202 * 1203 * If end of pattern is reached with all matches along the way, 1204 * pos is advanced to the first unparsed index and returned. 1205 * Otherwise -1 is returned. 1206 * @param pat pattern that controls parsing 1207 * @param text text to be parsed, starting at index 1208 * @param index offset to first character to parse 1209 * @param limit offset after last character to parse 1210 * @return index after last parsed character, or -1 on parse failure. 1211 */ 1212 public static int parsePattern(String pat, 1213 Replaceable text, 1214 int index, 1215 int limit) { 1216 int ipat = 0; 1217 1218 // empty pattern matches immediately 1219 if (ipat == pat.length()) { 1220 return index; 1221 } 1222 1223 int cpat = Character.codePointAt(pat, ipat); 1224 1225 while (index < limit) { 1226 int c = text.char32At(index); 1227 1228 // parse \s* 1229 if (cpat == '~') { 1230 if (PatternProps.isWhiteSpace(c)) { 1231 index += UTF16.getCharCount(c); 1232 continue; 1233 } else { 1234 if (++ipat == pat.length()) { 1235 return index; // success; c unparsed 1236 } 1237 // fall thru; process c again with next cpat 1238 } 1239 } 1240 1241 // parse literal 1242 else if (c == cpat) { 1243 int n = UTF16.getCharCount(c); 1244 index += n; 1245 ipat += n; 1246 if (ipat == pat.length()) { 1247 return index; // success; c parsed 1248 } 1249 // fall thru; get next cpat 1250 } 1251 1252 // match failure of literal 1253 else { 1254 return -1; 1255 } 1256 1257 cpat = UTF16.charAt(pat, ipat); 1258 } 1259 1260 return -1; // text ended before end of pat 1261 } 1262 1263 /** 1264 * Parse an integer at pos, either of the form \d+ or of the form 1265 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex, 1266 * or octal format. 1267 * @param pos INPUT-OUTPUT parameter. On input, the first 1268 * character to parse. On output, the character after the last 1269 * parsed character. 1270 */ 1271 public static int parseInteger(String rule, int[] pos, int limit) { 1272 int count = 0; 1273 int value = 0; 1274 int p = pos[0]; 1275 int radix = 10; 1276 1277 if (rule.regionMatches(true, p, "0x", 0, 2)) { 1278 p += 2; 1279 radix = 16; 1280 } else if (p < limit && rule.charAt(p) == '0') { 1281 p++; 1282 count = 1; 1283 radix = 8; 1284 } 1285 1286 while (p < limit) { 1287 int d = UCharacter.digit(rule.charAt(p++), radix); 1288 if (d < 0) { 1289 --p; 1290 break; 1291 } 1292 ++count; 1293 int v = (value * radix) + d; 1294 if (v <= value) { 1295 // If there are too many input digits, at some point 1296 // the value will go negative, e.g., if we have seen 1297 // "0x8000000" already and there is another '0', when 1298 // we parse the next 0 the value will go negative. 1299 return 0; 1300 } 1301 value = v; 1302 } 1303 if (count > 0) { 1304 pos[0] = p; 1305 } 1306 return value; 1307 } 1308 1309 /** 1310 * Parse a Unicode identifier from the given string at the given 1311 * position. Return the identifier, or null if there is no 1312 * identifier. 1313 * @param str the string to parse 1314 * @param pos INPUT-OUPUT parameter. On INPUT, pos[0] is the 1315 * first character to examine. It must be less than str.length(), 1316 * and it must not point to a whitespace character. That is, must 1317 * have pos[0] < str.length(). On 1318 * OUTPUT, the position after the last parsed character. 1319 * @return the Unicode identifier, or null if there is no valid 1320 * identifier at pos[0]. 1321 */ 1322 public static String parseUnicodeIdentifier(String str, int[] pos) { 1323 // assert(pos[0] < str.length()); 1324 StringBuilder buf = new StringBuilder(); 1325 int p = pos[0]; 1326 while (p < str.length()) { 1327 int ch = Character.codePointAt(str, p); 1328 if (buf.length() == 0) { 1329 if (UCharacter.isUnicodeIdentifierStart(ch)) { 1330 buf.appendCodePoint(ch); 1331 } else { 1332 return null; 1333 } 1334 } else { 1335 if (UCharacter.isUnicodeIdentifierPart(ch)) { 1336 buf.appendCodePoint(ch); 1337 } else { 1338 break; 1339 } 1340 } 1341 p += UTF16.getCharCount(ch); 1342 } 1343 pos[0] = p; 1344 return buf.toString(); 1345 } 1346 1347 static final char DIGITS[] = { 1348 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 1349 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 1350 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 1351 'U', 'V', 'W', 'X', 'Y', 'Z' 1352 }; 1353 1354 /** 1355 * Append the digits of a positive integer to the given 1356 * <code>Appendable</code> in the given radix. This is 1357 * done recursively since it is easiest to generate the low- 1358 * order digit first, but it must be appended last. 1359 * 1360 * @param result is the <code>Appendable</code> to append to 1361 * @param n is the positive integer 1362 * @param radix is the radix, from 2 to 36 inclusive 1363 * @param minDigits is the minimum number of digits to append. 1364 */ 1365 private static <T extends Appendable> void recursiveAppendNumber(T result, int n, 1366 int radix, int minDigits) 1367 { 1368 try { 1369 int digit = n % radix; 1370 1371 if (n >= radix || minDigits > 1) { 1372 recursiveAppendNumber(result, n / radix, radix, minDigits - 1); 1373 } 1374 result.append(DIGITS[digit]); 1375 } catch (IOException e) { 1376 throw new IllegalIcuArgumentException(e); 1377 } 1378 } 1379 1380 /** 1381 * Append a number to the given Appendable in the given radix. 1382 * Standard digits '0'-'9' are used and letters 'A'-'Z' for 1383 * radices 11 through 36. 1384 * @param result the digits of the number are appended here 1385 * @param n the number to be converted to digits; may be negative. 1386 * If negative, a '-' is prepended to the digits. 1387 * @param radix a radix from 2 to 36 inclusive. 1388 * @param minDigits the minimum number of digits, not including 1389 * any '-', to produce. Values less than 2 have no effect. One 1390 * digit is always emitted regardless of this parameter. 1391 * @return a reference to result 1392 */ 1393 public static <T extends Appendable> T appendNumber(T result, int n, 1394 int radix, int minDigits) 1395 { 1396 try { 1397 if (radix < 2 || radix > 36) { 1398 throw new IllegalArgumentException("Illegal radix " + radix); 1399 } 1400 1401 1402 int abs = n; 1403 1404 if (n < 0) { 1405 abs = -n; 1406 result.append("-"); 1407 } 1408 1409 recursiveAppendNumber(result, abs, radix, minDigits); 1410 1411 return result; 1412 } catch (IOException e) { 1413 throw new IllegalIcuArgumentException(e); 1414 } 1415 1416 } 1417 1418 /** 1419 * Parse an unsigned 31-bit integer at the given offset. Use 1420 * UCharacter.digit() to parse individual characters into digits. 1421 * @param text the text to be parsed 1422 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the 1423 * offset within text at which to start parsing; it should point 1424 * to a valid digit. On exit, pos[0] is the offset after the last 1425 * parsed character. If the parse failed, it will be unchanged on 1426 * exit. Must be >= 0 on entry. 1427 * @param radix the radix in which to parse; must be >= 2 and <= 1428 * 36. 1429 * @return a non-negative parsed number, or -1 upon parse failure. 1430 * Parse fails if there are no digits, that is, if pos[0] does not 1431 * point to a valid digit on entry, or if the number to be parsed 1432 * does not fit into a 31-bit unsigned integer. 1433 */ 1434 public static int parseNumber(String text, int[] pos, int radix) { 1435 // assert(pos[0] >= 0); 1436 // assert(radix >= 2); 1437 // assert(radix <= 36); 1438 int n = 0; 1439 int p = pos[0]; 1440 while (p < text.length()) { 1441 int ch = Character.codePointAt(text, p); 1442 int d = UCharacter.digit(ch, radix); 1443 if (d < 0) { 1444 break; 1445 } 1446 n = radix*n + d; 1447 // ASSUME that when a 32-bit integer overflows it becomes 1448 // negative. E.g., 214748364 * 10 + 8 => negative value. 1449 if (n < 0) { 1450 return -1; 1451 } 1452 ++p; 1453 } 1454 if (p == pos[0]) { 1455 return -1; 1456 } 1457 pos[0] = p; 1458 return n; 1459 } 1460 1461 /** 1462 * Return true if the character is NOT printable ASCII. The tab, 1463 * newline and linefeed characters are considered unprintable. 1464 */ 1465 public static boolean isUnprintable(int c) { 1466 //0x20 = 32 and 0x7E = 126 1467 return !(c >= 0x20 && c <= 0x7E); 1468 } 1469 1470 /** 1471 * Escape unprintable characters using <backslash>uxxxx notation 1472 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and 1473 * above. If the character is printable ASCII, then do nothing 1474 * and return FALSE. Otherwise, append the escaped notation and 1475 * return TRUE. 1476 */ 1477 public static <T extends Appendable> boolean escapeUnprintable(T result, int c) { 1478 try { 1479 if (isUnprintable(c)) { 1480 result.append('\\'); 1481 if ((c & ~0xFFFF) != 0) { 1482 result.append('U'); 1483 result.append(DIGITS[0xF&(c>>28)]); 1484 result.append(DIGITS[0xF&(c>>24)]); 1485 result.append(DIGITS[0xF&(c>>20)]); 1486 result.append(DIGITS[0xF&(c>>16)]); 1487 } else { 1488 result.append('u'); 1489 } 1490 result.append(DIGITS[0xF&(c>>12)]); 1491 result.append(DIGITS[0xF&(c>>8)]); 1492 result.append(DIGITS[0xF&(c>>4)]); 1493 result.append(DIGITS[0xF&c]); 1494 return true; 1495 } 1496 return false; 1497 } catch (IOException e) { 1498 throw new IllegalIcuArgumentException(e); 1499 } 1500 } 1501 1502 /** 1503 * Returns the index of the first character in a set, ignoring quoted text. 1504 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 1505 * found by a search for "h". Unlike String.indexOf(), this method searches 1506 * not for a single character, but for any character of the string 1507 * <code>setOfChars</code>. 1508 * @param text text to be searched 1509 * @param start the beginning index, inclusive; <code>0 <= start 1510 * <= limit</code>. 1511 * @param limit the ending index, exclusive; <code>start <= limit 1512 * <= text.length()</code>. 1513 * @param setOfChars string with one or more distinct characters 1514 * @return Offset of the first character in <code>setOfChars</code> 1515 * found, or -1 if not found. 1516 * @see String#indexOf 1517 */ 1518 public static int quotedIndexOf(String text, int start, int limit, 1519 String setOfChars) { 1520 for (int i=start; i<limit; ++i) { 1521 char c = text.charAt(i); 1522 if (c == BACKSLASH) { 1523 ++i; 1524 } else if (c == APOSTROPHE) { 1525 while (++i < limit 1526 && text.charAt(i) != APOSTROPHE) {} 1527 } else if (setOfChars.indexOf(c) >= 0) { 1528 return i; 1529 } 1530 } 1531 return -1; 1532 } 1533 1534 /** 1535 * Append a character to a rule that is being built up. To flush 1536 * the quoteBuf to rule, make one final call with isLiteral == true. 1537 * If there is no final character, pass in (int)-1 as c. 1538 * @param rule the string to append the character to 1539 * @param c the character to append, or (int)-1 if none. 1540 * @param isLiteral if true, then the given character should not be 1541 * quoted or escaped. Usually this means it is a syntactic element 1542 * such as > or $ 1543 * @param escapeUnprintable if true, then unprintable characters 1544 * should be escaped using escapeUnprintable(). These escapes will 1545 * appear outside of quotes. 1546 * @param quoteBuf a buffer which is used to build up quoted 1547 * substrings. The caller should initially supply an empty buffer, 1548 * and thereafter should not modify the buffer. The buffer should be 1549 * cleared out by, at the end, calling this method with a literal 1550 * character (which may be -1). 1551 */ 1552 public static void appendToRule(StringBuffer rule, 1553 int c, 1554 boolean isLiteral, 1555 boolean escapeUnprintable, 1556 StringBuffer quoteBuf) { 1557 // If we are escaping unprintables, then escape them outside 1558 // quotes. \\u and \\U are not recognized within quotes. The same 1559 // logic applies to literals, but literals are never escaped. 1560 if (isLiteral || 1561 (escapeUnprintable && Utility.isUnprintable(c))) { 1562 if (quoteBuf.length() > 0) { 1563 // We prefer backslash APOSTROPHE to double APOSTROPHE 1564 // (more readable, less similar to ") so if there are 1565 // double APOSTROPHEs at the ends, we pull them outside 1566 // of the quote. 1567 1568 // If the first thing in the quoteBuf is APOSTROPHE 1569 // (doubled) then pull it out. 1570 while (quoteBuf.length() >= 2 && 1571 quoteBuf.charAt(0) == APOSTROPHE && 1572 quoteBuf.charAt(1) == APOSTROPHE) { 1573 rule.append(BACKSLASH).append(APOSTROPHE); 1574 quoteBuf.delete(0, 2); 1575 } 1576 // If the last thing in the quoteBuf is APOSTROPHE 1577 // (doubled) then remove and count it and add it after. 1578 int trailingCount = 0; 1579 while (quoteBuf.length() >= 2 && 1580 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE && 1581 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) { 1582 quoteBuf.setLength(quoteBuf.length()-2); 1583 ++trailingCount; 1584 } 1585 if (quoteBuf.length() > 0) { 1586 rule.append(APOSTROPHE); 1587 rule.append(quoteBuf); 1588 rule.append(APOSTROPHE); 1589 quoteBuf.setLength(0); 1590 } 1591 while (trailingCount-- > 0) { 1592 rule.append(BACKSLASH).append(APOSTROPHE); 1593 } 1594 } 1595 if (c != -1) { 1596 /* Since spaces are ignored during parsing, they are 1597 * emitted only for readability. We emit one here 1598 * only if there isn't already one at the end of the 1599 * rule. 1600 */ 1601 if (c == ' ') { 1602 int len = rule.length(); 1603 if (len > 0 && rule.charAt(len-1) != ' ') { 1604 rule.append(' '); 1605 } 1606 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) { 1607 rule.appendCodePoint(c); 1608 } 1609 } 1610 } 1611 1612 // Escape ' and '\' and don't begin a quote just for them 1613 else if (quoteBuf.length() == 0 && 1614 (c == APOSTROPHE || c == BACKSLASH)) { 1615 rule.append(BACKSLASH).append((char)c); 1616 } 1617 1618 // Specials (printable ascii that isn't [0-9a-zA-Z]) and 1619 // whitespace need quoting. Also append stuff to quotes if we are 1620 // building up a quoted substring already. 1621 else if (quoteBuf.length() > 0 || 1622 (c >= 0x0021 && c <= 0x007E && 1623 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 1624 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 1625 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) || 1626 PatternProps.isWhiteSpace(c)) { 1627 quoteBuf.appendCodePoint(c); 1628 // Double ' within a quote 1629 if (c == APOSTROPHE) { 1630 quoteBuf.append((char)c); 1631 } 1632 } 1633 1634 // Otherwise just append 1635 else { 1636 rule.appendCodePoint(c); 1637 } 1638 } 1639 1640 /** 1641 * Append the given string to the rule. Calls the single-character 1642 * version of appendToRule for each character. 1643 */ 1644 public static void appendToRule(StringBuffer rule, 1645 String text, 1646 boolean isLiteral, 1647 boolean escapeUnprintable, 1648 StringBuffer quoteBuf) { 1649 for (int i=0; i<text.length(); ++i) { 1650 // Okay to process in 16-bit code units here 1651 appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf); 1652 } 1653 } 1654 1655 /** 1656 * Given a matcher reference, which may be null, append its 1657 * pattern as a literal to the given rule. 1658 */ 1659 public static void appendToRule(StringBuffer rule, 1660 UnicodeMatcher matcher, 1661 boolean escapeUnprintable, 1662 StringBuffer quoteBuf) { 1663 if (matcher != null) { 1664 appendToRule(rule, matcher.toPattern(escapeUnprintable), 1665 true, escapeUnprintable, quoteBuf); 1666 } 1667 } 1668 1669 /** 1670 * Compares 2 unsigned integers 1671 * @param source 32 bit unsigned integer 1672 * @param target 32 bit unsigned integer 1673 * @return 0 if equals, 1 if source is greater than target and -1 1674 * otherwise 1675 */ 1676 public static final int compareUnsigned(int source, int target) 1677 { 1678 source += MAGIC_UNSIGNED; 1679 target += MAGIC_UNSIGNED; 1680 if (source < target) { 1681 return -1; 1682 } 1683 else if (source > target) { 1684 return 1; 1685 } 1686 return 0; 1687 } 1688 1689 /** 1690 * Find the highest bit in a positive integer. This is done 1691 * by doing a binary search through the bits. 1692 * 1693 * @param n is the integer 1694 * 1695 * @return the bit number of the highest bit, with 0 being 1696 * the low order bit, or -1 if <code>n</code> is not positive 1697 */ 1698 public static final byte highBit(int n) 1699 { 1700 if (n <= 0) { 1701 return -1; 1702 } 1703 1704 byte bit = 0; 1705 1706 if (n >= 1 << 16) { 1707 n >>= 16; 1708 bit += 16; 1709 } 1710 1711 if (n >= 1 << 8) { 1712 n >>= 8; 1713 bit += 8; 1714 } 1715 1716 if (n >= 1 << 4) { 1717 n >>= 4; 1718 bit += 4; 1719 } 1720 1721 if (n >= 1 << 2) { 1722 n >>= 2; 1723 bit += 2; 1724 } 1725 1726 if (n >= 1 << 1) { 1727 n >>= 1; 1728 bit += 1; 1729 } 1730 1731 return bit; 1732 } 1733 /** 1734 * Utility method to take a int[] containing codepoints and return 1735 * a string representation with code units. 1736 */ 1737 public static String valueOf(int[]source){ 1738 // TODO: Investigate why this method is not on UTF16 class 1739 StringBuilder result = new StringBuilder(source.length); 1740 for(int i=0; i<source.length; i++){ 1741 result.appendCodePoint(source[i]); 1742 } 1743 return result.toString(); 1744 } 1745 1746 1747 /** 1748 * Utility to duplicate a string count times 1749 * @param s String to be duplicated. 1750 * @param count Number of times to duplicate a string. 1751 */ 1752 public static String repeat(String s, int count) { 1753 if (count <= 0) return ""; 1754 if (count == 1) return s; 1755 StringBuilder result = new StringBuilder(); 1756 for (int i = 0; i < count; ++i) { 1757 result.append(s); 1758 } 1759 return result.toString(); 1760 } 1761 1762 public static String[] splitString(String src, String target) { 1763 return src.split("\\Q" + target + "\\E"); 1764 } 1765 1766 /** 1767 * Split the string at runs of ascii whitespace characters. 1768 */ 1769 public static String[] splitWhitespace(String src) { 1770 return src.split("\\s+"); 1771 } 1772 1773 /** 1774 * Parse a list of hex numbers and return a string 1775 * @param string String of hex numbers. 1776 * @param minLength Minimal length. 1777 * @param separator Separator. 1778 * @return A string from hex numbers. 1779 */ 1780 public static String fromHex(String string, int minLength, String separator) { 1781 return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+")); 1782 } 1783 1784 /** 1785 * Parse a list of hex numbers and return a string 1786 * @param string String of hex numbers. 1787 * @param minLength Minimal length. 1788 * @param separator Separator. 1789 * @return A string from hex numbers. 1790 */ 1791 public static String fromHex(String string, int minLength, Pattern separator) { 1792 StringBuilder buffer = new StringBuilder(); 1793 String[] parts = separator.split(string); 1794 for (String part : parts) { 1795 if (part.length() < minLength) { 1796 throw new IllegalArgumentException("code point too short: " + part); 1797 } 1798 int cp = Integer.parseInt(part, 16); 1799 buffer.appendCodePoint(cp); 1800 } 1801 return buffer.toString(); 1802 } 1803 1804 /** 1805 * This implementation is equivalent to Java 8+ Math#addExact(int, int) 1806 * @param x the first value 1807 * @param y the second value 1808 * @return the result 1809 */ 1810 public static int addExact(int x, int y) { 1811 int r = x + y; 1812 // HD 2-12 Overflow iff both arguments have the opposite sign of the result 1813 if (((x ^ r) & (y ^ r)) < 0) { 1814 throw new ArithmeticException("integer overflow"); 1815 } 1816 return r; 1817 } 1818 1819 /** 1820 * Returns whether the chars in the two CharSequences are equal. 1821 */ 1822 public static boolean charSequenceEquals(CharSequence a, CharSequence b) { 1823 if (a == b) { 1824 return true; 1825 } 1826 if (a == null || b == null) { 1827 return false; 1828 } 1829 if (a.length() != b.length()) { 1830 return false; 1831 } 1832 for (int i = 0; i < a.length(); i++) { 1833 if (a.charAt(i) != b.charAt(i)) 1834 return false; 1835 } 1836 return true; 1837 } 1838 1839 /** 1840 * Returns a hash code for a CharSequence that is equivalent to calling 1841 * charSequence.toString().hashCode() 1842 */ 1843 public static int charSequenceHashCode(CharSequence value) { 1844 int hash = 0; 1845 for (int i = 0; i < value.length(); i++) { 1846 hash = hash * 31 + value.charAt(i); 1847 } 1848 return hash; 1849 } 1850 1851 /** 1852 * Appends a CharSequence to an Appendable, converting IOException to ICUUncheckedIOException. 1853 */ 1854 public static <A extends Appendable> A appendTo(CharSequence string, A appendable) { 1855 try { 1856 appendable.append(string); 1857 return appendable; 1858 } catch (IOException e) { 1859 throw new ICUUncheckedIOException(e); 1860 } 1861 } 1862 1863 /** 1864 * Java 8+ String#join(CharSequence, Iterable<? extends CharSequence>) compatible method for Java 7 env. 1865 * @param delimiter the delimiter that separates each element 1866 * @param elements the elements to join together. 1867 * @return a new String that is composed of the elements separated by the delimiter 1868 * @throws NullPointerException If delimiter or elements is null 1869 */ 1870 public static String joinStrings(CharSequence delimiter, Iterable<? extends CharSequence> elements) { 1871 if (delimiter == null || elements == null) { 1872 throw new NullPointerException("Delimiter or elements is null"); 1873 } 1874 StringBuilder buf = new StringBuilder(); 1875 Iterator<? extends CharSequence> itr = elements.iterator(); 1876 boolean isFirstElem = true; 1877 while (itr.hasNext()) { 1878 CharSequence element = itr.next(); 1879 if (element != null) { 1880 if (!isFirstElem) { 1881 buf.append(delimiter); 1882 } else { 1883 isFirstElem = false; 1884 } 1885 buf.append(element); 1886 } 1887 } 1888 return buf.toString(); 1889 } 1890 } 1891