1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2015, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import java.io.IOException; 12 import java.util.ArrayList; 13 import java.util.Locale; 14 import java.util.regex.Pattern; 15 16 import com.ibm.icu.lang.UCharacter; 17 import com.ibm.icu.text.Replaceable; 18 import com.ibm.icu.text.UTF16; 19 import com.ibm.icu.text.UnicodeMatcher; 20 21 public final class Utility { 22 23 private static final char APOSTROPHE = '\''; 24 private static final char BACKSLASH = '\\'; 25 private static final int MAGIC_UNSIGNED = 0x80000000; 26 27 /** 28 * Convenience utility to compare two Object[]s. 29 * Ought to be in System 30 */ arrayEquals(Object[] source, Object target)31 public final static boolean arrayEquals(Object[] source, Object target) { 32 if (source == null) return (target == null); 33 if (!(target instanceof Object[])) return false; 34 Object[] targ = (Object[]) target; 35 return (source.length == targ.length 36 && arrayRegionMatches(source, 0, targ, 0, source.length)); 37 } 38 39 /** 40 * Convenience utility to compare two int[]s 41 * Ought to be in System 42 */ arrayEquals(int[] source, Object target)43 public final static boolean arrayEquals(int[] source, Object target) { 44 if (source == null) return (target == null); 45 if (!(target instanceof int[])) return false; 46 int[] targ = (int[]) target; 47 return (source.length == targ.length 48 && arrayRegionMatches(source, 0, targ, 0, source.length)); 49 } 50 51 /** 52 * Convenience utility to compare two double[]s 53 * Ought to be in System 54 */ arrayEquals(double[] source, Object target)55 public final static boolean 
arrayEquals(double[] source, Object target) { 56 if (source == null) return (target == null); 57 if (!(target instanceof double[])) return false; 58 double[] targ = (double[]) target; 59 return (source.length == targ.length 60 && arrayRegionMatches(source, 0, targ, 0, source.length)); 61 } arrayEquals(byte[] source, Object target)62 public final static boolean arrayEquals(byte[] source, Object target) { 63 if (source == null) return (target == null); 64 if (!(target instanceof byte[])) return false; 65 byte[] targ = (byte[]) target; 66 return (source.length == targ.length 67 && arrayRegionMatches(source, 0, targ, 0, source.length)); 68 } 69 70 /** 71 * Convenience utility to compare two Object[]s 72 * Ought to be in System 73 */ arrayEquals(Object source, Object target)74 public final static boolean arrayEquals(Object source, Object target) { 75 if (source == null) return (target == null); 76 // for some reason, the correct arrayEquals is not being called 77 // so do it by hand for now. 78 if (source instanceof Object[]) 79 return(arrayEquals((Object[]) source,target)); 80 if (source instanceof int[]) 81 return(arrayEquals((int[]) source,target)); 82 if (source instanceof double[]) 83 return(arrayEquals((double[]) source, target)); 84 if (source instanceof byte[]) 85 return(arrayEquals((byte[]) source,target)); 86 return source.equals(target); 87 } 88 89 /** 90 * Convenience utility to compare two Object[]s 91 * Ought to be in System. 92 * @param len the length to compare. 93 * The start indices and start+len must be valid. 
94 */ arrayRegionMatches(Object[] source, int sourceStart, Object[] target, int targetStart, int len)95 public final static boolean arrayRegionMatches(Object[] source, int sourceStart, 96 Object[] target, int targetStart, 97 int len) 98 { 99 int sourceEnd = sourceStart + len; 100 int delta = targetStart - sourceStart; 101 for (int i = sourceStart; i < sourceEnd; i++) { 102 if (!arrayEquals(source[i],target[i + delta])) 103 return false; 104 } 105 return true; 106 } 107 108 /** 109 * Convenience utility to compare two Object[]s 110 * Ought to be in System. 111 * @param len the length to compare. 112 * The start indices and start+len must be valid. 113 */ arrayRegionMatches(char[] source, int sourceStart, char[] target, int targetStart, int len)114 public final static boolean arrayRegionMatches(char[] source, int sourceStart, 115 char[] target, int targetStart, 116 int len) 117 { 118 int sourceEnd = sourceStart + len; 119 int delta = targetStart - sourceStart; 120 for (int i = sourceStart; i < sourceEnd; i++) { 121 if (source[i]!=target[i + delta]) 122 return false; 123 } 124 return true; 125 } 126 127 /** 128 * Convenience utility to compare two int[]s. 129 * @param len the length to compare. 130 * The start indices and start+len must be valid. 131 * Ought to be in System 132 */ arrayRegionMatches(int[] source, int sourceStart, int[] target, int targetStart, int len)133 public final static boolean arrayRegionMatches(int[] source, int sourceStart, 134 int[] target, int targetStart, 135 int len) 136 { 137 int sourceEnd = sourceStart + len; 138 int delta = targetStart - sourceStart; 139 for (int i = sourceStart; i < sourceEnd; i++) { 140 if (source[i] != target[i + delta]) 141 return false; 142 } 143 return true; 144 } 145 146 /** 147 * Convenience utility to compare two arrays of doubles. 148 * @param len the length to compare. 149 * The start indices and start+len must be valid. 
150 * Ought to be in System 151 */ arrayRegionMatches(double[] source, int sourceStart, double[] target, int targetStart, int len)152 public final static boolean arrayRegionMatches(double[] source, int sourceStart, 153 double[] target, int targetStart, 154 int len) 155 { 156 int sourceEnd = sourceStart + len; 157 int delta = targetStart - sourceStart; 158 for (int i = sourceStart; i < sourceEnd; i++) { 159 if (source[i] != target[i + delta]) 160 return false; 161 } 162 return true; 163 } arrayRegionMatches(byte[] source, int sourceStart, byte[] target, int targetStart, int len)164 public final static boolean arrayRegionMatches(byte[] source, int sourceStart, 165 byte[] target, int targetStart, int len){ 166 int sourceEnd = sourceStart + len; 167 int delta = targetStart - sourceStart; 168 for (int i = sourceStart; i < sourceEnd; i++) { 169 if (source[i] != target[i + delta]) 170 return false; 171 } 172 return true; 173 } 174 175 /** 176 * Trivial reference equality. 177 * This method should help document that we really want == not equals(), 178 * and to have a single place to suppress warnings from static analysis tools. 179 */ sameObjects(Object a, Object b)180 public static final boolean sameObjects(Object a, Object b) { 181 return a == b; 182 } 183 184 /** 185 * Convenience utility. Does null checks on objects, then calls compare. 186 */ checkCompare(T a, T b)187 public static <T extends Comparable<T>> int checkCompare(T a, T b) { 188 return a == null ? 189 b == null ? 0 : -1 : 190 b == null ? 1 : a.compareTo(b); 191 } 192 193 /** 194 * Convenience utility. Does null checks on object, then calls hashCode. 195 */ checkHash(Object a)196 public static int checkHash(Object a) { 197 return a == null ? 0 : a.hashCode(); 198 } 199 200 /** 201 * The ESCAPE character is used during run-length encoding. It signals 202 * a run of identical chars. 
203 */ 204 private static final char ESCAPE = '\uA5A5'; 205 206 /** 207 * The ESCAPE_BYTE character is used during run-length encoding. It signals 208 * a run of identical bytes. 209 */ 210 static final byte ESCAPE_BYTE = (byte)0xA5; 211 212 /** 213 * Construct a string representing an int array. Use run-length encoding. 214 * A character represents itself, unless it is the ESCAPE character. Then 215 * the following notations are possible: 216 * ESCAPE ESCAPE ESCAPE literal 217 * ESCAPE n c n instances of character c 218 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 219 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 220 * If we encounter a run where n == ESCAPE, we represent this as: 221 * c ESCAPE n-1 c 222 * The ESCAPE value is chosen so as not to collide with commonly 223 * seen values. 224 */ arrayToRLEString(int[] a)225 static public final String arrayToRLEString(int[] a) { 226 StringBuilder buffer = new StringBuilder(); 227 228 appendInt(buffer, a.length); 229 int runValue = a[0]; 230 int runLength = 1; 231 for (int i=1; i<a.length; ++i) { 232 int s = a[i]; 233 if (s == runValue && runLength < 0xFFFF) { 234 ++runLength; 235 } else { 236 encodeRun(buffer, runValue, runLength); 237 runValue = s; 238 runLength = 1; 239 } 240 } 241 encodeRun(buffer, runValue, runLength); 242 return buffer.toString(); 243 } 244 245 /** 246 * Construct a string representing a short array. Use run-length encoding. 247 * A character represents itself, unless it is the ESCAPE character. Then 248 * the following notations are possible: 249 * ESCAPE ESCAPE ESCAPE literal 250 * ESCAPE n c n instances of character c 251 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 252 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 
253 * If we encounter a run where n == ESCAPE, we represent this as: 254 * c ESCAPE n-1 c 255 * The ESCAPE value is chosen so as not to collide with commonly 256 * seen values. 257 */ arrayToRLEString(short[] a)258 static public final String arrayToRLEString(short[] a) { 259 StringBuilder buffer = new StringBuilder(); 260 // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]); 261 buffer.append((char) (a.length >> 16)); 262 buffer.append((char) a.length); 263 short runValue = a[0]; 264 int runLength = 1; 265 for (int i=1; i<a.length; ++i) { 266 short s = a[i]; 267 if (s == runValue && runLength < 0xFFFF) ++runLength; 268 else { 269 encodeRun(buffer, runValue, runLength); 270 runValue = s; 271 runLength = 1; 272 } 273 } 274 encodeRun(buffer, runValue, runLength); 275 return buffer.toString(); 276 } 277 278 /** 279 * Construct a string representing a char array. Use run-length encoding. 280 * A character represents itself, unless it is the ESCAPE character. Then 281 * the following notations are possible: 282 * ESCAPE ESCAPE ESCAPE literal 283 * ESCAPE n c n instances of character c 284 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 285 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 286 * If we encounter a run where n == ESCAPE, we represent this as: 287 * c ESCAPE n-1 c 288 * The ESCAPE value is chosen so as not to collide with commonly 289 * seen values. 
290 */ arrayToRLEString(char[] a)291 static public final String arrayToRLEString(char[] a) { 292 StringBuilder buffer = new StringBuilder(); 293 buffer.append((char) (a.length >> 16)); 294 buffer.append((char) a.length); 295 char runValue = a[0]; 296 int runLength = 1; 297 for (int i=1; i<a.length; ++i) { 298 char s = a[i]; 299 if (s == runValue && runLength < 0xFFFF) ++runLength; 300 else { 301 encodeRun(buffer, (short)runValue, runLength); 302 runValue = s; 303 runLength = 1; 304 } 305 } 306 encodeRun(buffer, (short)runValue, runLength); 307 return buffer.toString(); 308 } 309 310 /** 311 * Construct a string representing a byte array. Use run-length encoding. 312 * Two bytes are packed into a single char, with a single extra zero byte at 313 * the end if needed. A byte represents itself, unless it is the 314 * ESCAPE_BYTE. Then the following notations are possible: 315 * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal 316 * ESCAPE_BYTE n b n instances of byte b 317 * Since an encoded run occupies 3 bytes, we only encode runs of 4 or 318 * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. 319 * If we encounter a run where n == ESCAPE_BYTE, we represent this as: 320 * b ESCAPE_BYTE n-1 b 321 * The ESCAPE_BYTE value is chosen so as not to collide with commonly 322 * seen values. 323 */ arrayToRLEString(byte[] a)324 static public final String arrayToRLEString(byte[] a) { 325 StringBuilder buffer = new StringBuilder(); 326 buffer.append((char) (a.length >> 16)); 327 buffer.append((char) a.length); 328 byte runValue = a[0]; 329 int runLength = 1; 330 byte[] state = new byte[2]; 331 for (int i=1; i<a.length; ++i) { 332 byte b = a[i]; 333 if (b == runValue && runLength < 0xFF) ++runLength; 334 else { 335 encodeRun(buffer, runValue, runLength, state); 336 runValue = b; 337 runLength = 1; 338 } 339 } 340 encodeRun(buffer, runValue, runLength, state); 341 342 // We must save the final byte, if there is one, by padding 343 // an extra zero. 
344 if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state); 345 346 return buffer.toString(); 347 } 348 349 /** 350 * Encode a run, possibly a degenerate run (of < 4 values). 351 * @param length The length of the run; must be > 0 && <= 0xFFFF. 352 */ encodeRun(T buffer, int value, int length)353 private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) { 354 if (length < 4) { 355 for (int j=0; j<length; ++j) { 356 if (value == ESCAPE) { 357 appendInt(buffer, value); 358 } 359 appendInt(buffer, value); 360 } 361 } 362 else { 363 if (length == ESCAPE) { 364 if (value == ESCAPE) { 365 appendInt(buffer, ESCAPE); 366 } 367 appendInt(buffer, value); 368 --length; 369 } 370 appendInt(buffer, ESCAPE); 371 appendInt(buffer, length); 372 appendInt(buffer, value); // Don't need to escape this value 373 } 374 } 375 appendInt(T buffer, int value)376 private static final <T extends Appendable> void appendInt(T buffer, int value) { 377 try { 378 buffer.append((char)(value >>> 16)); 379 buffer.append((char)(value & 0xFFFF)); 380 } catch (IOException e) { 381 throw new IllegalIcuArgumentException(e); 382 } 383 } 384 385 /** 386 * Encode a run, possibly a degenerate run (of < 4 values). 387 * @param length The length of the run; must be > 0 && <= 0xFFFF. 
388 */ encodeRun(T buffer, short value, int length)389 private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) { 390 try { 391 char valueChar = (char) value; 392 if (length < 4) { 393 for (int j=0; j<length; ++j) { 394 if (valueChar == ESCAPE) { 395 buffer.append(ESCAPE); 396 } 397 buffer.append(valueChar); 398 } 399 } 400 else { 401 if (length == ESCAPE) { 402 if (valueChar == ESCAPE) { 403 buffer.append(ESCAPE); 404 } 405 buffer.append(valueChar); 406 --length; 407 } 408 buffer.append(ESCAPE); 409 buffer.append((char) length); 410 buffer.append(valueChar); // Don't need to escape this value 411 } 412 } catch (IOException e) { 413 throw new IllegalIcuArgumentException(e); 414 } 415 } 416 417 /** 418 * Encode a run, possibly a degenerate run (of < 4 values). 419 * @param length The length of the run; must be > 0 && <= 0xFF. 420 */ encodeRun(T buffer, byte value, int length, byte[] state)421 private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length, 422 byte[] state) { 423 if (length < 4) { 424 for (int j=0; j<length; ++j) { 425 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 426 appendEncodedByte(buffer, value, state); 427 } 428 } 429 else { 430 if ((byte)length == ESCAPE_BYTE) { 431 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 432 appendEncodedByte(buffer, value, state); 433 --length; 434 } 435 appendEncodedByte(buffer, ESCAPE_BYTE, state); 436 appendEncodedByte(buffer, (byte)length, state); 437 appendEncodedByte(buffer, value, state); // Don't need to escape this value 438 } 439 } 440 441 /** 442 * Append a byte to the given Appendable, packing two bytes into each 443 * character. The state parameter maintains intermediary data between 444 * calls. 
445 * @param state A two-element array, with state[0] == 0 if this is the 446 * first byte of a pair, or state[0] != 0 if this is the second byte 447 * of a pair, in which case state[1] is the first byte. 448 */ appendEncodedByte(T buffer, byte value, byte[] state)449 private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value, 450 byte[] state) { 451 try { 452 if (state[0] != 0) { 453 char c = (char) ((state[1] << 8) | ((value) & 0xFF)); 454 buffer.append(c); 455 state[0] = 0; 456 } 457 else { 458 state[0] = 1; 459 state[1] = value; 460 } 461 } catch (IOException e) { 462 throw new IllegalIcuArgumentException(e); 463 } 464 } 465 466 /** 467 * Construct an array of ints from a run-length encoded string. 468 */ RLEStringToIntArray(String s)469 static public final int[] RLEStringToIntArray(String s) { 470 int length = getInt(s, 0); 471 int[] array = new int[length]; 472 int ai = 0, i = 1; 473 474 int maxI = s.length() / 2; 475 while (ai < length && i < maxI) { 476 int c = getInt(s, i++); 477 478 if (c == ESCAPE) { 479 c = getInt(s, i++); 480 if (c == ESCAPE) { 481 array[ai++] = c; 482 } else { 483 int runLength = c; 484 int runValue = getInt(s, i++); 485 for (int j=0; j<runLength; ++j) { 486 array[ai++] = runValue; 487 } 488 } 489 } 490 else { 491 array[ai++] = c; 492 } 493 } 494 495 if (ai != length || i != maxI) { 496 throw new IllegalStateException("Bad run-length encoded int array"); 497 } 498 499 return array; 500 } getInt(String s, int i)501 static final int getInt(String s, int i) { 502 return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1); 503 } 504 505 /** 506 * Construct an array of shorts from a run-length encoded string. 
507 */ RLEStringToShortArray(String s)508 static public final short[] RLEStringToShortArray(String s) { 509 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 510 short[] array = new short[length]; 511 int ai = 0; 512 for (int i=2; i<s.length(); ++i) { 513 char c = s.charAt(i); 514 if (c == ESCAPE) { 515 c = s.charAt(++i); 516 if (c == ESCAPE) { 517 array[ai++] = (short) c; 518 } else { 519 int runLength = c; 520 short runValue = (short) s.charAt(++i); 521 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 522 } 523 } 524 else { 525 array[ai++] = (short) c; 526 } 527 } 528 529 if (ai != length) 530 throw new IllegalStateException("Bad run-length encoded short array"); 531 532 return array; 533 } 534 535 /** 536 * Construct an array of shorts from a run-length encoded string. 537 */ RLEStringToCharArray(String s)538 static public final char[] RLEStringToCharArray(String s) { 539 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 540 char[] array = new char[length]; 541 int ai = 0; 542 for (int i=2; i<s.length(); ++i) { 543 char c = s.charAt(i); 544 if (c == ESCAPE) { 545 c = s.charAt(++i); 546 if (c == ESCAPE) { 547 array[ai++] = c; 548 } else { 549 int runLength = c; 550 char runValue = s.charAt(++i); 551 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 552 } 553 } 554 else { 555 array[ai++] = c; 556 } 557 } 558 559 if (ai != length) 560 throw new IllegalStateException("Bad run-length encoded short array"); 561 562 return array; 563 } 564 565 /** 566 * Construct an array of bytes from a run-length encoded string. 567 */ RLEStringToByteArray(String s)568 static public final byte[] RLEStringToByteArray(String s) { 569 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 570 byte[] array = new byte[length]; 571 boolean nextChar = true; 572 char c = 0; 573 int node = 0; 574 int runLength = 0; 575 int i = 2; 576 for (int ai=0; ai<length; ) { 577 // This part of the loop places the next byte into the local 578 // variable 'b' each time through the loop. 
It keeps the 579 // current character in 'c' and uses the boolean 'nextChar' 580 // to see if we've taken both bytes out of 'c' yet. 581 byte b; 582 if (nextChar) { 583 c = s.charAt(i++); 584 b = (byte) (c >> 8); 585 nextChar = false; 586 } 587 else { 588 b = (byte) (c & 0xFF); 589 nextChar = true; 590 } 591 592 // This part of the loop is a tiny state machine which handles 593 // the parsing of the run-length encoding. This would be simpler 594 // if we could look ahead, but we can't, so we use 'node' to 595 // move between three nodes in the state machine. 596 switch (node) { 597 case 0: 598 // Normal idle node 599 if (b == ESCAPE_BYTE) { 600 node = 1; 601 } 602 else { 603 array[ai++] = b; 604 } 605 break; 606 case 1: 607 // We have seen one ESCAPE_BYTE; we expect either a second 608 // one, or a run length and value. 609 if (b == ESCAPE_BYTE) { 610 array[ai++] = ESCAPE_BYTE; 611 node = 0; 612 } 613 else { 614 runLength = b; 615 // Interpret signed byte as unsigned 616 if (runLength < 0) runLength += 0x100; 617 node = 2; 618 } 619 break; 620 case 2: 621 // We have seen an ESCAPE_BYTE and length byte. We interpret 622 // the next byte as the value to be repeated. 623 for (int j=0; j<runLength; ++j) array[ai++] = b; 624 node = 0; 625 break; 626 } 627 } 628 629 if (node != 0) 630 throw new IllegalStateException("Bad run-length encoded byte array"); 631 632 if (i != s.length()) 633 throw new IllegalStateException("Excess data in RLE byte array string"); 634 635 return array; 636 } 637 638 static public String LINE_SEPARATOR = System.getProperty("line.separator"); 639 640 /** 641 * Format a String for representation in a source file. This includes 642 * breaking it into lines and escaping characters using octal notation 643 * when necessary (control characters and double quotes). 
644 */ formatForSource(String s)645 static public final String formatForSource(String s) { 646 StringBuilder buffer = new StringBuilder(); 647 for (int i=0; i<s.length();) { 648 if (i > 0) buffer.append('+').append(LINE_SEPARATOR); 649 buffer.append(" \""); 650 int count = 11; 651 while (i<s.length() && count<80) { 652 char c = s.charAt(i++); 653 if (c < '\u0020' || c == '"' || c == '\\') { 654 if (c == '\n') { 655 buffer.append("\\n"); 656 count += 2; 657 } else if (c == '\t') { 658 buffer.append("\\t"); 659 count += 2; 660 } else if (c == '\r') { 661 buffer.append("\\r"); 662 count += 2; 663 } else { 664 // Represent control characters, backslash and double quote 665 // using octal notation; otherwise the string we form 666 // won't compile, since Unicode escape sequences are 667 // processed before tokenization. 668 buffer.append('\\'); 669 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 670 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 671 buffer.append(HEX_DIGIT[(c & 0007)]); 672 count += 4; 673 } 674 } 675 else if (c <= '\u007E') { 676 buffer.append(c); 677 count += 1; 678 } 679 else { 680 buffer.append("\\u"); 681 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 682 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 683 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 684 buffer.append(HEX_DIGIT[(c & 0x000F)]); 685 count += 6; 686 } 687 } 688 buffer.append('"'); 689 } 690 return buffer.toString(); 691 } 692 693 static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7', 694 '8','9','A','B','C','D','E','F'}; 695 696 /** 697 * Format a String for representation in a source file. Like 698 * formatForSource but does not do line breaking. 
699 */ format1ForSource(String s)700 static public final String format1ForSource(String s) { 701 StringBuilder buffer = new StringBuilder(); 702 buffer.append("\""); 703 for (int i=0; i<s.length();) { 704 char c = s.charAt(i++); 705 if (c < '\u0020' || c == '"' || c == '\\') { 706 if (c == '\n') { 707 buffer.append("\\n"); 708 } else if (c == '\t') { 709 buffer.append("\\t"); 710 } else if (c == '\r') { 711 buffer.append("\\r"); 712 } else { 713 // Represent control characters, backslash and double quote 714 // using octal notation; otherwise the string we form 715 // won't compile, since Unicode escape sequences are 716 // processed before tokenization. 717 buffer.append('\\'); 718 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 719 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 720 buffer.append(HEX_DIGIT[(c & 0007)]); 721 } 722 } 723 else if (c <= '\u007E') { 724 buffer.append(c); 725 } 726 else { 727 buffer.append("\\u"); 728 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 729 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 730 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 731 buffer.append(HEX_DIGIT[(c & 0x000F)]); 732 } 733 } 734 buffer.append('"'); 735 return buffer.toString(); 736 } 737 738 /** 739 * Convert characters outside the range U+0020 to U+007F to 740 * Unicode escapes, and convert backslash to a double backslash. 741 */ escape(String s)742 public static final String escape(String s) { 743 StringBuilder buf = new StringBuilder(); 744 for (int i=0; i<s.length(); ) { 745 int c = Character.codePointAt(s, i); 746 i += UTF16.getCharCount(c); 747 if (c >= ' ' && c <= 0x007F) { 748 if (c == '\\') { 749 buf.append("\\\\"); // That is, "\\" 750 } else { 751 buf.append((char)c); 752 } 753 } else { 754 boolean four = c <= 0xFFFF; 755 buf.append(four ? "\\u" : "\\U"); 756 buf.append(hex(c, four ? 
4 : 8)); 757 } 758 } 759 return buf.toString(); 760 } 761 762 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 763 static private final char[] UNESCAPE_MAP = { 764 /*" 0x22, 0x22 */ 765 /*' 0x27, 0x27 */ 766 /*? 0x3F, 0x3F */ 767 /*\ 0x5C, 0x5C */ 768 /*a*/ 0x61, 0x07, 769 /*b*/ 0x62, 0x08, 770 /*e*/ 0x65, 0x1b, 771 /*f*/ 0x66, 0x0c, 772 /*n*/ 0x6E, 0x0a, 773 /*r*/ 0x72, 0x0d, 774 /*t*/ 0x74, 0x09, 775 /*v*/ 0x76, 0x0b 776 }; 777 778 /** 779 * Convert an escape to a 32-bit code point value. We attempt 780 * to parallel the icu4c unescapeAt() function. 781 * @param offset16 an array containing offset to the character 782 * <em>after</em> the backslash. Upon return offset16[0] will 783 * be updated to point after the escape sequence. 784 * @return character value from 0 to 10FFFF, or -1 on error. 785 */ unescapeAt(String s, int[] offset16)786 public static int unescapeAt(String s, int[] offset16) { 787 int c; 788 int result = 0; 789 int n = 0; 790 int minDig = 0; 791 int maxDig = 0; 792 int bitsPerDigit = 4; 793 int dig; 794 int i; 795 boolean braces = false; 796 797 /* Check that offset is in range */ 798 int offset = offset16[0]; 799 int length = s.length(); 800 if (offset < 0 || offset >= length) { 801 return -1; 802 } 803 804 /* Fetch first UChar after '\\' */ 805 c = Character.codePointAt(s, offset); 806 offset += UTF16.getCharCount(c); 807 808 /* Convert hexadecimal and octal escapes */ 809 switch (c) { 810 case 'u': 811 minDig = maxDig = 4; 812 break; 813 case 'U': 814 minDig = maxDig = 8; 815 break; 816 case 'x': 817 minDig = 1; 818 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { 819 ++offset; 820 braces = true; 821 maxDig = 8; 822 } else { 823 maxDig = 2; 824 } 825 break; 826 default: 827 dig = UCharacter.digit(c, 8); 828 if (dig >= 0) { 829 minDig = 1; 830 maxDig = 3; 831 n = 1; /* Already have first octal digit */ 832 bitsPerDigit = 3; 833 result = dig; 834 } 835 break; 836 } 837 if (minDig != 0) { 838 while (offset < length && 
n < maxDig) { 839 c = UTF16.charAt(s, offset); 840 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); 841 if (dig < 0) { 842 break; 843 } 844 result = (result << bitsPerDigit) | dig; 845 offset += UTF16.getCharCount(c); 846 ++n; 847 } 848 if (n < minDig) { 849 return -1; 850 } 851 if (braces) { 852 if (c != 0x7D /*}*/) { 853 return -1; 854 } 855 ++offset; 856 } 857 if (result < 0 || result >= 0x110000) { 858 return -1; 859 } 860 // If an escape sequence specifies a lead surrogate, see 861 // if there is a trail surrogate after it, either as an 862 // escape or as a literal. If so, join them up into a 863 // supplementary. 864 if (offset < length && 865 UTF16.isLeadSurrogate((char) result)) { 866 int ahead = offset+1; 867 c = s.charAt(offset); // [sic] get 16-bit code unit 868 if (c == '\\' && ahead < length) { 869 int o[] = new int[] { ahead }; 870 c = unescapeAt(s, o); 871 ahead = o[0]; 872 } 873 if (UTF16.isTrailSurrogate((char) c)) { 874 offset = ahead; 875 result = Character.toCodePoint((char) result, (char) c); 876 } 877 } 878 offset16[0] = offset; 879 return result; 880 } 881 882 /* Convert C-style escapes in table */ 883 for (i=0; i<UNESCAPE_MAP.length; i+=2) { 884 if (c == UNESCAPE_MAP[i]) { 885 offset16[0] = offset; 886 return UNESCAPE_MAP[i+1]; 887 } else if (c < UNESCAPE_MAP[i]) { 888 break; 889 } 890 } 891 892 /* Map \cX to control-X: X & 0x1F */ 893 if (c == 'c' && offset < length) { 894 c = UTF16.charAt(s, offset); 895 offset16[0] = offset + UTF16.getCharCount(c); 896 return 0x1F & c; 897 } 898 899 /* If no special forms are recognized, then consider 900 * the backslash to generically escape the next character. */ 901 offset16[0] = offset; 902 return c; 903 } 904 905 /** 906 * Convert all escapes in a given string using unescapeAt(). 907 * @exception IllegalArgumentException if an invalid escape is 908 * seen. 
909 */ unescape(String s)910 public static String unescape(String s) { 911 StringBuilder buf = new StringBuilder(); 912 int[] pos = new int[1]; 913 for (int i=0; i<s.length(); ) { 914 char c = s.charAt(i++); 915 if (c == '\\') { 916 pos[0] = i; 917 int e = unescapeAt(s, pos); 918 if (e < 0) { 919 throw new IllegalArgumentException("Invalid escape sequence " + 920 s.substring(i-1, Math.min(i+8, s.length()))); 921 } 922 buf.appendCodePoint(e); 923 i = pos[0]; 924 } else { 925 buf.append(c); 926 } 927 } 928 return buf.toString(); 929 } 930 931 /** 932 * Convert all escapes in a given string using unescapeAt(). 933 * Leave invalid escape sequences unchanged. 934 */ unescapeLeniently(String s)935 public static String unescapeLeniently(String s) { 936 StringBuilder buf = new StringBuilder(); 937 int[] pos = new int[1]; 938 for (int i=0; i<s.length(); ) { 939 char c = s.charAt(i++); 940 if (c == '\\') { 941 pos[0] = i; 942 int e = unescapeAt(s, pos); 943 if (e < 0) { 944 buf.append(c); 945 } else { 946 buf.appendCodePoint(e); 947 i = pos[0]; 948 } 949 } else { 950 buf.append(c); 951 } 952 } 953 return buf.toString(); 954 } 955 956 /** 957 * Convert a char to 4 hex uppercase digits. E.g., hex('a') => 958 * "0041". 959 */ hex(long ch)960 public static String hex(long ch) { 961 return hex(ch, 4); 962 } 963 964 /** 965 * Supplies a zero-padded hex representation of an integer (without 0x) 966 */ hex(long i, int places)967 static public String hex(long i, int places) { 968 if (i == Long.MIN_VALUE) return "-8000000000000000"; 969 boolean negative = i < 0; 970 if (negative) { 971 i = -i; 972 } 973 String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); 974 if (result.length() < places) { 975 result = "0000000000000000".substring(result.length(),places) + result; 976 } 977 if (negative) { 978 return '-' + result; 979 } 980 return result; 981 } 982 983 /** 984 * Convert a string to comma-separated groups of 4 hex uppercase 985 * digits. E.g., hex('ab') => "0041,0042". 
986 */ 987 public static String hex(CharSequence s) { 988 return hex(s, 4, ",", true, new StringBuilder()).toString(); 989 } 990 991 /** 992 * Convert a string to separated groups of hex uppercase 993 * digits. E.g., hex('ab'...) => "0041,0042". Append the output 994 * to the given Appendable. 995 */ 996 public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) { 997 try { 998 if (useCodePoints) { 999 int cp; 1000 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1001 cp = Character.codePointAt(s, i); 1002 if (i != 0) { 1003 result.append(separator); 1004 } 1005 result.append(hex(cp,width)); 1006 } 1007 } else { 1008 for (int i = 0; i < s.length(); ++i) { 1009 if (i != 0) { 1010 result.append(separator); 1011 } 1012 result.append(hex(s.charAt(i),width)); 1013 } 1014 } 1015 return result; 1016 } catch (IOException e) { 1017 throw new IllegalIcuArgumentException(e); 1018 } 1019 } 1020 1021 public static String hex(byte[] o, int start, int end, String separator) { 1022 StringBuilder result = new StringBuilder(); 1023 //int ch; 1024 for (int i = start; i < end; ++i) { 1025 if (i != 0) result.append(separator); 1026 result.append(hex(o[i])); 1027 } 1028 return result.toString(); 1029 } 1030 1031 /** 1032 * Convert a string to comma-separated groups of 4 hex uppercase 1033 * digits. E.g., hex('ab') => "0041,0042". 1034 */ 1035 public static <S extends CharSequence> String hex(S s, int width, S separator) { 1036 return hex(s, width, separator, true, new StringBuilder()).toString(); 1037 } 1038 1039 /** 1040 * Split a string into pieces based on the given divider character 1041 * @param s the string to split 1042 * @param divider the character on which to split. Occurrences of 1043 * this character are not included in the output 1044 * @param output an array to receive the substrings between 1045 * instances of divider. 
It must be large enough on entry to 1046 * accomodate all output. Adjacent instances of the divider 1047 * character will place empty strings into output. Before 1048 * returning, output is padded out with empty strings. 1049 */ 1050 public static void split(String s, char divider, String[] output) { 1051 int last = 0; 1052 int current = 0; 1053 int i; 1054 for (i = 0; i < s.length(); ++i) { 1055 if (s.charAt(i) == divider) { 1056 output[current++] = s.substring(last,i); 1057 last = i+1; 1058 } 1059 } 1060 output[current++] = s.substring(last,i); 1061 while (current < output.length) { 1062 output[current++] = ""; 1063 } 1064 } 1065 1066 /** 1067 * Split a string into pieces based on the given divider character 1068 * @param s the string to split 1069 * @param divider the character on which to split. Occurrences of 1070 * this character are not included in the output 1071 * @return output an array to receive the substrings between 1072 * instances of divider. Adjacent instances of the divider 1073 * character will place empty strings into output. 1074 */ 1075 public static String[] split(String s, char divider) { 1076 int last = 0; 1077 int i; 1078 ArrayList<String> output = new ArrayList<>(); 1079 for (i = 0; i < s.length(); ++i) { 1080 if (s.charAt(i) == divider) { 1081 output.add(s.substring(last,i)); 1082 last = i+1; 1083 } 1084 } 1085 output.add( s.substring(last,i)); 1086 return output.toArray(new String[output.size()]); 1087 } 1088 1089 /** 1090 * Look up a given string in a string array. Returns the index at 1091 * which the first occurrence of the string was found in the 1092 * array, or -1 if it was not found. 
1093 * @param source the string to search for 1094 * @param target the array of zero or more strings in which to 1095 * look for source 1096 * @return the index of target at which source first occurs, or -1 1097 * if not found 1098 */ 1099 public static int lookup(String source, String[] target) { 1100 for (int i = 0; i < target.length; ++i) { 1101 if (source.equals(target[i])) return i; 1102 } 1103 return -1; 1104 } 1105 1106 /** 1107 * Parse a single non-whitespace character 'ch', optionally 1108 * preceded by whitespace. 1109 * @param id the string to be parsed 1110 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 1111 * offset of the first character to be parsed. On output, pos[0] 1112 * is the index after the last parsed character. If the parse 1113 * fails, pos[0] will be unchanged. 1114 * @param ch the non-whitespace character to be parsed. 1115 * @return true if 'ch' is seen preceded by zero or more 1116 * whitespace characters. 1117 */ 1118 public static boolean parseChar(String id, int[] pos, char ch) { 1119 int start = pos[0]; 1120 pos[0] = PatternProps.skipWhiteSpace(id, pos[0]); 1121 if (pos[0] == id.length() || 1122 id.charAt(pos[0]) != ch) { 1123 pos[0] = start; 1124 return false; 1125 } 1126 ++pos[0]; 1127 return true; 1128 } 1129 1130 /** 1131 * Parse a pattern string starting at offset pos. Keywords are 1132 * matched case-insensitively. Spaces may be skipped and may be 1133 * optional or required. Integer values may be parsed, and if 1134 * they are, they will be returned in the given array. If 1135 * successful, the offset of the next non-space character is 1136 * returned. On failure, -1 is returned. 1137 * @param pattern must only contain lowercase characters, which 1138 * will match their uppercase equivalents as well. A space 1139 * character matches one or more required spaces. A '~' character 1140 * matches zero or more optional spaces. 
A '#' character matches 1141 * an integer and stores it in parsedInts, which the caller must 1142 * ensure has enough capacity. 1143 * @param parsedInts array to receive parsed integers. Caller 1144 * must ensure that parsedInts.length is >= the number of '#' 1145 * signs in 'pattern'. 1146 * @return the position after the last character parsed, or -1 if 1147 * the parse failed 1148 */ 1149 @SuppressWarnings("fallthrough") 1150 public static int parsePattern(String rule, int pos, int limit, 1151 String pattern, int[] parsedInts) { 1152 // TODO Update this to handle surrogates 1153 int[] p = new int[1]; 1154 int intCount = 0; // number of integers parsed 1155 for (int i=0; i<pattern.length(); ++i) { 1156 char cpat = pattern.charAt(i); 1157 char c; 1158 switch (cpat) { 1159 case ' ': 1160 if (pos >= limit) { 1161 return -1; 1162 } 1163 c = rule.charAt(pos++); 1164 if (!PatternProps.isWhiteSpace(c)) { 1165 return -1; 1166 } 1167 // FALL THROUGH to skipWhitespace 1168 case '~': 1169 pos = PatternProps.skipWhiteSpace(rule, pos); 1170 break; 1171 case '#': 1172 p[0] = pos; 1173 parsedInts[intCount++] = parseInteger(rule, p, limit); 1174 if (p[0] == pos) { 1175 // Syntax error; failed to parse integer 1176 return -1; 1177 } 1178 pos = p[0]; 1179 break; 1180 default: 1181 if (pos >= limit) { 1182 return -1; 1183 } 1184 c = (char) UCharacter.toLowerCase(rule.charAt(pos++)); 1185 if (c != cpat) { 1186 return -1; 1187 } 1188 break; 1189 } 1190 } 1191 return pos; 1192 } 1193 1194 /** 1195 * Parse a pattern string within the given Replaceable and a parsing 1196 * pattern. Characters are matched literally and case-sensitively 1197 * except for the following special characters: 1198 * 1199 * ~ zero or more Pattern_White_Space chars 1200 * 1201 * If end of pattern is reached with all matches along the way, 1202 * pos is advanced to the first unparsed index and returned. 1203 * Otherwise -1 is returned. 
1204 * @param pat pattern that controls parsing 1205 * @param text text to be parsed, starting at index 1206 * @param index offset to first character to parse 1207 * @param limit offset after last character to parse 1208 * @return index after last parsed character, or -1 on parse failure. 1209 */ 1210 public static int parsePattern(String pat, 1211 Replaceable text, 1212 int index, 1213 int limit) { 1214 int ipat = 0; 1215 1216 // empty pattern matches immediately 1217 if (ipat == pat.length()) { 1218 return index; 1219 } 1220 1221 int cpat = Character.codePointAt(pat, ipat); 1222 1223 while (index < limit) { 1224 int c = text.char32At(index); 1225 1226 // parse \s* 1227 if (cpat == '~') { 1228 if (PatternProps.isWhiteSpace(c)) { 1229 index += UTF16.getCharCount(c); 1230 continue; 1231 } else { 1232 if (++ipat == pat.length()) { 1233 return index; // success; c unparsed 1234 } 1235 // fall thru; process c again with next cpat 1236 } 1237 } 1238 1239 // parse literal 1240 else if (c == cpat) { 1241 int n = UTF16.getCharCount(c); 1242 index += n; 1243 ipat += n; 1244 if (ipat == pat.length()) { 1245 return index; // success; c parsed 1246 } 1247 // fall thru; get next cpat 1248 } 1249 1250 // match failure of literal 1251 else { 1252 return -1; 1253 } 1254 1255 cpat = UTF16.charAt(pat, ipat); 1256 } 1257 1258 return -1; // text ended before end of pat 1259 } 1260 1261 /** 1262 * Parse an integer at pos, either of the form \d+ or of the form 1263 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex, 1264 * or octal format. 1265 * @param pos INPUT-OUTPUT parameter. On input, the first 1266 * character to parse. On output, the character after the last 1267 * parsed character. 
1268 */ 1269 public static int parseInteger(String rule, int[] pos, int limit) { 1270 int count = 0; 1271 int value = 0; 1272 int p = pos[0]; 1273 int radix = 10; 1274 1275 if (rule.regionMatches(true, p, "0x", 0, 2)) { 1276 p += 2; 1277 radix = 16; 1278 } else if (p < limit && rule.charAt(p) == '0') { 1279 p++; 1280 count = 1; 1281 radix = 8; 1282 } 1283 1284 while (p < limit) { 1285 int d = UCharacter.digit(rule.charAt(p++), radix); 1286 if (d < 0) { 1287 --p; 1288 break; 1289 } 1290 ++count; 1291 int v = (value * radix) + d; 1292 if (v <= value) { 1293 // If there are too many input digits, at some point 1294 // the value will go negative, e.g., if we have seen 1295 // "0x8000000" already and there is another '0', when 1296 // we parse the next 0 the value will go negative. 1297 return 0; 1298 } 1299 value = v; 1300 } 1301 if (count > 0) { 1302 pos[0] = p; 1303 } 1304 return value; 1305 } 1306 1307 /** 1308 * Parse a Unicode identifier from the given string at the given 1309 * position. Return the identifier, or null if there is no 1310 * identifier. 1311 * @param str the string to parse 1312 * @param pos INPUT-OUPUT parameter. On INPUT, pos[0] is the 1313 * first character to examine. It must be less than str.length(), 1314 * and it must not point to a whitespace character. That is, must 1315 * have pos[0] < str.length(). On 1316 * OUTPUT, the position after the last parsed character. 1317 * @return the Unicode identifier, or null if there is no valid 1318 * identifier at pos[0]. 
1319 */ 1320 public static String parseUnicodeIdentifier(String str, int[] pos) { 1321 // assert(pos[0] < str.length()); 1322 StringBuilder buf = new StringBuilder(); 1323 int p = pos[0]; 1324 while (p < str.length()) { 1325 int ch = Character.codePointAt(str, p); 1326 if (buf.length() == 0) { 1327 if (UCharacter.isUnicodeIdentifierStart(ch)) { 1328 buf.appendCodePoint(ch); 1329 } else { 1330 return null; 1331 } 1332 } else { 1333 if (UCharacter.isUnicodeIdentifierPart(ch)) { 1334 buf.appendCodePoint(ch); 1335 } else { 1336 break; 1337 } 1338 } 1339 p += UTF16.getCharCount(ch); 1340 } 1341 pos[0] = p; 1342 return buf.toString(); 1343 } 1344 1345 static final char DIGITS[] = { 1346 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 1347 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 1348 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 1349 'U', 'V', 'W', 'X', 'Y', 'Z' 1350 }; 1351 1352 /** 1353 * Append the digits of a positive integer to the given 1354 * <code>Appendable</code> in the given radix. This is 1355 * done recursively since it is easiest to generate the low- 1356 * order digit first, but it must be appended last. 1357 * 1358 * @param result is the <code>Appendable</code> to append to 1359 * @param n is the positive integer 1360 * @param radix is the radix, from 2 to 36 inclusive 1361 * @param minDigits is the minimum number of digits to append. 1362 */ 1363 private static <T extends Appendable> void recursiveAppendNumber(T result, int n, 1364 int radix, int minDigits) 1365 { 1366 try { 1367 int digit = n % radix; 1368 1369 if (n >= radix || minDigits > 1) { 1370 recursiveAppendNumber(result, n / radix, radix, minDigits - 1); 1371 } 1372 result.append(DIGITS[digit]); 1373 } catch (IOException e) { 1374 throw new IllegalIcuArgumentException(e); 1375 } 1376 } 1377 1378 /** 1379 * Append a number to the given Appendable in the given radix. 1380 * Standard digits '0'-'9' are used and letters 'A'-'Z' for 1381 * radices 11 through 36. 
1382 * @param result the digits of the number are appended here 1383 * @param n the number to be converted to digits; may be negative. 1384 * If negative, a '-' is prepended to the digits. 1385 * @param radix a radix from 2 to 36 inclusive. 1386 * @param minDigits the minimum number of digits, not including 1387 * any '-', to produce. Values less than 2 have no effect. One 1388 * digit is always emitted regardless of this parameter. 1389 * @return a reference to result 1390 */ 1391 public static <T extends Appendable> T appendNumber(T result, int n, 1392 int radix, int minDigits) 1393 { 1394 try { 1395 if (radix < 2 || radix > 36) { 1396 throw new IllegalArgumentException("Illegal radix " + radix); 1397 } 1398 1399 1400 int abs = n; 1401 1402 if (n < 0) { 1403 abs = -n; 1404 result.append("-"); 1405 } 1406 1407 recursiveAppendNumber(result, abs, radix, minDigits); 1408 1409 return result; 1410 } catch (IOException e) { 1411 throw new IllegalIcuArgumentException(e); 1412 } 1413 1414 } 1415 1416 /** 1417 * Parse an unsigned 31-bit integer at the given offset. Use 1418 * UCharacter.digit() to parse individual characters into digits. 1419 * @param text the text to be parsed 1420 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the 1421 * offset within text at which to start parsing; it should point 1422 * to a valid digit. On exit, pos[0] is the offset after the last 1423 * parsed character. If the parse failed, it will be unchanged on 1424 * exit. Must be >= 0 on entry. 1425 * @param radix the radix in which to parse; must be >= 2 and <= 1426 * 36. 1427 * @return a non-negative parsed number, or -1 upon parse failure. 1428 * Parse fails if there are no digits, that is, if pos[0] does not 1429 * point to a valid digit on entry, or if the number to be parsed 1430 * does not fit into a 31-bit unsigned integer. 
1431 */ 1432 public static int parseNumber(String text, int[] pos, int radix) { 1433 // assert(pos[0] >= 0); 1434 // assert(radix >= 2); 1435 // assert(radix <= 36); 1436 int n = 0; 1437 int p = pos[0]; 1438 while (p < text.length()) { 1439 int ch = Character.codePointAt(text, p); 1440 int d = UCharacter.digit(ch, radix); 1441 if (d < 0) { 1442 break; 1443 } 1444 n = radix*n + d; 1445 // ASSUME that when a 32-bit integer overflows it becomes 1446 // negative. E.g., 214748364 * 10 + 8 => negative value. 1447 if (n < 0) { 1448 return -1; 1449 } 1450 ++p; 1451 } 1452 if (p == pos[0]) { 1453 return -1; 1454 } 1455 pos[0] = p; 1456 return n; 1457 } 1458 1459 /** 1460 * Return true if the character is NOT printable ASCII. The tab, 1461 * newline and linefeed characters are considered unprintable. 1462 */ 1463 public static boolean isUnprintable(int c) { 1464 //0x20 = 32 and 0x7E = 126 1465 return !(c >= 0x20 && c <= 0x7E); 1466 } 1467 1468 /** 1469 * Escape unprintable characters using <backslash>uxxxx notation 1470 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and 1471 * above. If the character is printable ASCII, then do nothing 1472 * and return FALSE. Otherwise, append the escaped notation and 1473 * return TRUE. 
1474 */ 1475 public static <T extends Appendable> boolean escapeUnprintable(T result, int c) { 1476 try { 1477 if (isUnprintable(c)) { 1478 result.append('\\'); 1479 if ((c & ~0xFFFF) != 0) { 1480 result.append('U'); 1481 result.append(DIGITS[0xF&(c>>28)]); 1482 result.append(DIGITS[0xF&(c>>24)]); 1483 result.append(DIGITS[0xF&(c>>20)]); 1484 result.append(DIGITS[0xF&(c>>16)]); 1485 } else { 1486 result.append('u'); 1487 } 1488 result.append(DIGITS[0xF&(c>>12)]); 1489 result.append(DIGITS[0xF&(c>>8)]); 1490 result.append(DIGITS[0xF&(c>>4)]); 1491 result.append(DIGITS[0xF&c]); 1492 return true; 1493 } 1494 return false; 1495 } catch (IOException e) { 1496 throw new IllegalIcuArgumentException(e); 1497 } 1498 } 1499 1500 /** 1501 * Returns the index of the first character in a set, ignoring quoted text. 1502 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 1503 * found by a search for "h". Unlike String.indexOf(), this method searches 1504 * not for a single character, but for any character of the string 1505 * <code>setOfChars</code>. 1506 * @param text text to be searched 1507 * @param start the beginning index, inclusive; <code>0 <= start 1508 * <= limit</code>. 1509 * @param limit the ending index, exclusive; <code>start <= limit 1510 * <= text.length()</code>. 1511 * @param setOfChars string with one or more distinct characters 1512 * @return Offset of the first character in <code>setOfChars</code> 1513 * found, or -1 if not found. 1514 * @see String#indexOf 1515 */ 1516 public static int quotedIndexOf(String text, int start, int limit, 1517 String setOfChars) { 1518 for (int i=start; i<limit; ++i) { 1519 char c = text.charAt(i); 1520 if (c == BACKSLASH) { 1521 ++i; 1522 } else if (c == APOSTROPHE) { 1523 while (++i < limit 1524 && text.charAt(i) != APOSTROPHE) {} 1525 } else if (setOfChars.indexOf(c) >= 0) { 1526 return i; 1527 } 1528 } 1529 return -1; 1530 } 1531 1532 /** 1533 * Append a character to a rule that is being built up. 
    /**
     * Append a character to a rule that is being built up.  To flush
     * the quoteBuf to rule, make one final call with isLiteral == true.
     * If there is no final character, pass in (int)-1 as c.
     *
     * @param rule the string to append the character to
     * @param c the character to append, or (int)-1 if none.
     * @param isLiteral if true, then the given character should not be
     * quoted or escaped.  Usually this means it is a syntactic element
     * such as > or $
     * @param escapeUnprintable if true, then unprintable characters
     * should be escaped using escapeUnprintable().  These escapes will
     * appear outside of quotes.
     * @param quoteBuf a buffer which is used to build up quoted
     * substrings.  The caller should initially supply an empty buffer,
     * and thereafter should not modify the buffer.  The buffer should be
     * cleared out by, at the end, calling this method with a literal
     * character (which may be -1).
     */
    public static void appendToRule(StringBuffer rule,
            int c,
            boolean isLiteral,
            boolean escapeUnprintable,
            StringBuffer quoteBuf) {
        // If we are escaping unprintables, then escape them outside
        // quotes.  \\u and \\U are not recognized within quotes.  The same
        // logic applies to literals, but literals are never escaped.
        if (isLiteral ||
                (escapeUnprintable && Utility.isUnprintable(c))) {
            // First flush any pending quoted text, so that c lands after it.
            if (quoteBuf.length() > 0) {
                // We prefer backslash APOSTROPHE to double APOSTROPHE
                // (more readable, less similar to ") so if there are
                // double APOSTROPHEs at the ends, we pull them outside
                // of the quote.

                // If the first thing in the quoteBuf is APOSTROPHE
                // (doubled) then pull it out.
                while (quoteBuf.length() >= 2 &&
                        quoteBuf.charAt(0) == APOSTROPHE &&
                        quoteBuf.charAt(1) == APOSTROPHE) {
                    rule.append(BACKSLASH).append(APOSTROPHE);
                    quoteBuf.delete(0, 2);
                }
                // If the last thing in the quoteBuf is APOSTROPHE
                // (doubled) then remove and count it and add it after.
                int trailingCount = 0;
                while (quoteBuf.length() >= 2 &&
                        quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
                        quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
                    quoteBuf.setLength(quoteBuf.length()-2);
                    ++trailingCount;
                }
                // Whatever is left is emitted between a pair of apostrophes,
                // and the buffer is emptied for the next quoted run.
                if (quoteBuf.length() > 0) {
                    rule.append(APOSTROPHE);
                    rule.append(quoteBuf);
                    rule.append(APOSTROPHE);
                    quoteBuf.setLength(0);
                }
                // Re-emit the trailing doubled apostrophes as escapes.
                while (trailingCount-- > 0) {
                    rule.append(BACKSLASH).append(APOSTROPHE);
                }
            }
            // -1 means "flush only": there is no character to append.
            if (c != -1) {
                /* Since spaces are ignored during parsing, they are
                 * emitted only for readability.  We emit one here
                 * only if there isn't already one at the end of the
                 * rule.
                 */
                if (c == ' ') {
                    int len = rule.length();
                    if (len > 0 && rule.charAt(len-1) != ' ') {
                        rule.append(' ');
                    }
                } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
                    // escapeUnprintable() has already appended the escape
                    // when it returns true; otherwise append c itself.
                    rule.appendCodePoint(c);
                }
            }
        }

        // Escape ' and '\' and don't begin a quote just for them
        else if (quoteBuf.length() == 0 &&
                (c == APOSTROPHE || c == BACKSLASH)) {
            rule.append(BACKSLASH).append((char)c);
        }

        // Specials (printable ascii that isn't [0-9a-zA-Z]) and
        // whitespace need quoting.  Also append stuff to quotes if we are
        // building up a quoted substring already.
        else if (quoteBuf.length() > 0 ||
                (c >= 0x0021 && c <= 0x007E &&
                        !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
                                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
                                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
                PatternProps.isWhiteSpace(c)) {
            quoteBuf.appendCodePoint(c);
            // Double ' within a quote
            if (c == APOSTROPHE) {
                quoteBuf.append((char)c);
            }
        }

        // Otherwise just append
        else {
            rule.appendCodePoint(c);
        }
    }

    /**
     * Append the given string to the rule.  Calls the single-character
     * version of appendToRule for each character.
     *
     * @param rule the string to append to
     * @param text the characters to append, one 16-bit code unit at a time
     * @param isLiteral passed through unchanged for every character
     * @param escapeUnprintable passed through unchanged for every character
     * @param quoteBuf passed through unchanged for every character
     */
    public static void appendToRule(StringBuffer rule,
            String text,
            boolean isLiteral,
            boolean escapeUnprintable,
            StringBuffer quoteBuf) {
        for (int i=0; i<text.length(); ++i) {
            // Okay to process in 16-bit code units here
            appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
        }
    }

    /**
     * Given a matcher reference, which may be null, append its
     * pattern as a literal to the given rule.  Nothing is appended when
     * the matcher is null.
     *
     * @param rule the string to append to
     * @param matcher the matcher whose toPattern() output is appended, or null
     * @param escapeUnprintable passed both to toPattern() and to the
     * string version of appendToRule
     * @param quoteBuf passed through to the string version of appendToRule
     */
    public static void appendToRule(StringBuffer rule,
            UnicodeMatcher matcher,
            boolean escapeUnprintable,
            StringBuffer quoteBuf) {
        if (matcher != null) {
            appendToRule(rule, matcher.toPattern(escapeUnprintable),
                    true, escapeUnprintable, quoteBuf);
        }
    }

    /**
     * Compares 2 unsigned integers.
     *
     * @param source 32 bit unsigned integer
     * @param target 32 bit unsigned integer
     * @return 0 if equals, 1 if source is greater than target and -1
     * otherwise
     */
    public static final int compareUnsigned(int source, int target)
    {
        // Adding MAGIC_UNSIGNED (0x80000000) flips the sign bit of both
        // operands, so the signed comparisons below order them as if they
        // were unsigned.
        source += MAGIC_UNSIGNED;
        target += MAGIC_UNSIGNED;
        if (source < target) {
            return -1;
        }
        else if (source > target) {
            return 1;
        }
        return 0;
    }
This is done 1689 * by doing a binary search through the bits. 1690 * 1691 * @param n is the integer 1692 * 1693 * @return the bit number of the highest bit, with 0 being 1694 * the low order bit, or -1 if <code>n</code> is not positive 1695 */ 1696 public static final byte highBit(int n) 1697 { 1698 if (n <= 0) { 1699 return -1; 1700 } 1701 1702 byte bit = 0; 1703 1704 if (n >= 1 << 16) { 1705 n >>= 16; 1706 bit += 16; 1707 } 1708 1709 if (n >= 1 << 8) { 1710 n >>= 8; 1711 bit += 8; 1712 } 1713 1714 if (n >= 1 << 4) { 1715 n >>= 4; 1716 bit += 4; 1717 } 1718 1719 if (n >= 1 << 2) { 1720 n >>= 2; 1721 bit += 2; 1722 } 1723 1724 if (n >= 1 << 1) { 1725 n >>= 1; 1726 bit += 1; 1727 } 1728 1729 return bit; 1730 } 1731 /** 1732 * Utility method to take a int[] containing codepoints and return 1733 * a string representation with code units. 1734 */ 1735 public static String valueOf(int[]source){ 1736 // TODO: Investigate why this method is not on UTF16 class 1737 StringBuilder result = new StringBuilder(source.length); 1738 for(int i=0; i<source.length; i++){ 1739 result.appendCodePoint(source[i]); 1740 } 1741 return result.toString(); 1742 } 1743 1744 1745 /** 1746 * Utility to duplicate a string count times 1747 * @param s String to be duplicated. 1748 * @param count Number of times to duplicate a string. 1749 */ 1750 public static String repeat(String s, int count) { 1751 if (count <= 0) return ""; 1752 if (count == 1) return s; 1753 StringBuilder result = new StringBuilder(); 1754 for (int i = 0; i < count; ++i) { 1755 result.append(s); 1756 } 1757 return result.toString(); 1758 } 1759 1760 public static String[] splitString(String src, String target) { 1761 return src.split("\\Q" + target + "\\E"); 1762 } 1763 1764 /** 1765 * Split the string at runs of ascii whitespace characters. 
1766 */ 1767 public static String[] splitWhitespace(String src) { 1768 return src.split("\\s+"); 1769 } 1770 1771 /** 1772 * Parse a list of hex numbers and return a string 1773 * @param string String of hex numbers. 1774 * @param minLength Minimal length. 1775 * @param separator Separator. 1776 * @return A string from hex numbers. 1777 */ 1778 public static String fromHex(String string, int minLength, String separator) { 1779 return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+")); 1780 } 1781 1782 /** 1783 * Parse a list of hex numbers and return a string 1784 * @param string String of hex numbers. 1785 * @param minLength Minimal length. 1786 * @param separator Separator. 1787 * @return A string from hex numbers. 1788 */ 1789 public static String fromHex(String string, int minLength, Pattern separator) { 1790 StringBuilder buffer = new StringBuilder(); 1791 String[] parts = separator.split(string); 1792 for (String part : parts) { 1793 if (part.length() < minLength) { 1794 throw new IllegalArgumentException("code point too short: " + part); 1795 } 1796 int cp = Integer.parseInt(part, 16); 1797 buffer.appendCodePoint(cp); 1798 } 1799 return buffer.toString(); 1800 } 1801 1802 /** 1803 * This implementation is equivalent to Java 8+ Math#addExact(int, int) 1804 * @param x the first value 1805 * @param y the second value 1806 * @return the result 1807 */ 1808 public static int addExact(int x, int y) { 1809 int r = x + y; 1810 // HD 2-12 Overflow iff both arguments have the opposite sign of the result 1811 if (((x ^ r) & (y ^ r)) < 0) { 1812 throw new ArithmeticException("integer overflow"); 1813 } 1814 return r; 1815 } 1816 1817 /** 1818 * Returns whether the chars in the two CharSequences are equal. 
1819 */ 1820 public static boolean charSequenceEquals(CharSequence a, CharSequence b) { 1821 if (a == b) { 1822 return true; 1823 } 1824 if (a == null || b == null) { 1825 return false; 1826 } 1827 if (a.length() != b.length()) { 1828 return false; 1829 } 1830 for (int i = 0; i < a.length(); i++) { 1831 if (a.charAt(i) != b.charAt(i)) 1832 return false; 1833 } 1834 return true; 1835 } 1836 1837 /** 1838 * Returns a hash code for a CharSequence that is equivalent to calling 1839 * charSequence.toString().hashCode() 1840 */ 1841 public static int charSequenceHashCode(CharSequence value) { 1842 int hash = 0; 1843 for (int i = 0; i < value.length(); i++) { 1844 hash = hash * 31 + value.charAt(i); 1845 } 1846 return hash; 1847 } 1848 } 1849