1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2015, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import java.io.IOException; 12 import java.util.ArrayList; 13 import java.util.Locale; 14 import java.util.regex.Pattern; 15 16 import com.ibm.icu.lang.UCharacter; 17 import com.ibm.icu.text.Replaceable; 18 import com.ibm.icu.text.UTF16; 19 import com.ibm.icu.text.UnicodeMatcher; 20 import com.ibm.icu.util.ICUUncheckedIOException; 21 22 public final class Utility { 23 24 private static final char APOSTROPHE = '\''; 25 private static final char BACKSLASH = '\\'; 26 private static final int MAGIC_UNSIGNED = 0x80000000; 27 28 /** 29 * Convenience utility to compare two Object[]s. 30 * Ought to be in System 31 */ arrayEquals(Object[] source, Object target)32 public final static boolean arrayEquals(Object[] source, Object target) { 33 if (source == null) return (target == null); 34 if (!(target instanceof Object[])) return false; 35 Object[] targ = (Object[]) target; 36 return (source.length == targ.length 37 && arrayRegionMatches(source, 0, targ, 0, source.length)); 38 } 39 40 /** 41 * Convenience utility to compare two int[]s 42 * Ought to be in System 43 */ arrayEquals(int[] source, Object target)44 public final static boolean arrayEquals(int[] source, Object target) { 45 if (source == null) return (target == null); 46 if (!(target instanceof int[])) return false; 47 int[] targ = (int[]) target; 48 return (source.length == targ.length 49 && arrayRegionMatches(source, 0, targ, 0, source.length)); 50 } 51 52 /** 53 * Convenience utility to compare two double[]s 54 * Ought to be in System 55 */ arrayEquals(double[] source, Object target)56 public final static boolean arrayEquals(double[] source, Object target) { 57 if (source == null) return (target == null); 58 if (!(target instanceof double[])) return false; 59 double[] targ = (double[]) target; 60 return (source.length == targ.length 61 && arrayRegionMatches(source, 0, targ, 0, source.length)); 62 } arrayEquals(byte[] source, Object target)63 public final static boolean arrayEquals(byte[] source, Object target) { 64 if (source == null) return (target == null); 65 if (!(target instanceof byte[])) return false; 66 byte[] targ = (byte[]) target; 67 return (source.length == targ.length 68 && arrayRegionMatches(source, 0, targ, 0, source.length)); 69 } 70 71 /** 72 * Convenience utility to compare two Object[]s 73 * Ought to be in System 74 */ arrayEquals(Object source, Object target)75 public final static boolean arrayEquals(Object source, Object target) { 76 if (source == null) return (target == null); 77 // for some reason, the correct arrayEquals is not being called 78 // so do it by hand for now. 79 if (source instanceof Object[]) 80 return(arrayEquals((Object[]) source,target)); 81 if (source instanceof int[]) 82 return(arrayEquals((int[]) source,target)); 83 if (source instanceof double[]) 84 return(arrayEquals((double[]) source, target)); 85 if (source instanceof byte[]) 86 return(arrayEquals((byte[]) source,target)); 87 return source.equals(target); 88 } 89 90 /** 91 * Convenience utility to compare two Object[]s 92 * Ought to be in System. 93 * @param len the length to compare. 94 * The start indices and start+len must be valid. 95 */ arrayRegionMatches(Object[] source, int sourceStart, Object[] target, int targetStart, int len)96 public final static boolean arrayRegionMatches(Object[] source, int sourceStart, 97 Object[] target, int targetStart, 98 int len) 99 { 100 int sourceEnd = sourceStart + len; 101 int delta = targetStart - sourceStart; 102 for (int i = sourceStart; i < sourceEnd; i++) { 103 if (!arrayEquals(source[i],target[i + delta])) 104 return false; 105 } 106 return true; 107 } 108 109 /** 110 * Convenience utility to compare two Object[]s 111 * Ought to be in System. 112 * @param len the length to compare. 113 * The start indices and start+len must be valid. 114 */ arrayRegionMatches(char[] source, int sourceStart, char[] target, int targetStart, int len)115 public final static boolean arrayRegionMatches(char[] source, int sourceStart, 116 char[] target, int targetStart, 117 int len) 118 { 119 int sourceEnd = sourceStart + len; 120 int delta = targetStart - sourceStart; 121 for (int i = sourceStart; i < sourceEnd; i++) { 122 if (source[i]!=target[i + delta]) 123 return false; 124 } 125 return true; 126 } 127 128 /** 129 * Convenience utility to compare two int[]s. 130 * @param len the length to compare. 131 * The start indices and start+len must be valid. 132 * Ought to be in System 133 */ arrayRegionMatches(int[] source, int sourceStart, int[] target, int targetStart, int len)134 public final static boolean arrayRegionMatches(int[] source, int sourceStart, 135 int[] target, int targetStart, 136 int len) 137 { 138 int sourceEnd = sourceStart + len; 139 int delta = targetStart - sourceStart; 140 for (int i = sourceStart; i < sourceEnd; i++) { 141 if (source[i] != target[i + delta]) 142 return false; 143 } 144 return true; 145 } 146 147 /** 148 * Convenience utility to compare two arrays of doubles. 149 * @param len the length to compare. 150 * The start indices and start+len must be valid. 151 * Ought to be in System 152 */ arrayRegionMatches(double[] source, int sourceStart, double[] target, int targetStart, int len)153 public final static boolean arrayRegionMatches(double[] source, int sourceStart, 154 double[] target, int targetStart, 155 int len) 156 { 157 int sourceEnd = sourceStart + len; 158 int delta = targetStart - sourceStart; 159 for (int i = sourceStart; i < sourceEnd; i++) { 160 if (source[i] != target[i + delta]) 161 return false; 162 } 163 return true; 164 } arrayRegionMatches(byte[] source, int sourceStart, byte[] target, int targetStart, int len)165 public final static boolean arrayRegionMatches(byte[] source, int sourceStart, 166 byte[] target, int targetStart, int len){ 167 int sourceEnd = sourceStart + len; 168 int delta = targetStart - sourceStart; 169 for (int i = sourceStart; i < sourceEnd; i++) { 170 if (source[i] != target[i + delta]) 171 return false; 172 } 173 return true; 174 } 175 176 /** 177 * Trivial reference equality. 178 * This method should help document that we really want == not equals(), 179 * and to have a single place to suppress warnings from static analysis tools. 180 */ sameObjects(Object a, Object b)181 public static final boolean sameObjects(Object a, Object b) { 182 return a == b; 183 } 184 185 /** 186 * Convenience utility. Does null checks on objects, then calls compare. 187 */ checkCompare(T a, T b)188 public static <T extends Comparable<T>> int checkCompare(T a, T b) { 189 return a == null ? 190 b == null ? 0 : -1 : 191 b == null ? 1 : a.compareTo(b); 192 } 193 194 /** 195 * Convenience utility. Does null checks on object, then calls hashCode. 196 */ checkHash(Object a)197 public static int checkHash(Object a) { 198 return a == null ? 0 : a.hashCode(); 199 } 200 201 /** 202 * The ESCAPE character is used during run-length encoding. It signals 203 * a run of identical chars. 204 */ 205 private static final char ESCAPE = '\uA5A5'; 206 207 /** 208 * The ESCAPE_BYTE character is used during run-length encoding. It signals 209 * a run of identical bytes. 210 */ 211 static final byte ESCAPE_BYTE = (byte)0xA5; 212 213 /** 214 * Construct a string representing an int array. Use run-length encoding. 215 * A character represents itself, unless it is the ESCAPE character. Then 216 * the following notations are possible: 217 * ESCAPE ESCAPE ESCAPE literal 218 * ESCAPE n c n instances of character c 219 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 220 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 221 * If we encounter a run where n == ESCAPE, we represent this as: 222 * c ESCAPE n-1 c 223 * The ESCAPE value is chosen so as not to collide with commonly 224 * seen values. 225 */ arrayToRLEString(int[] a)226 static public final String arrayToRLEString(int[] a) { 227 StringBuilder buffer = new StringBuilder(); 228 229 appendInt(buffer, a.length); 230 int runValue = a[0]; 231 int runLength = 1; 232 for (int i=1; i<a.length; ++i) { 233 int s = a[i]; 234 if (s == runValue && runLength < 0xFFFF) { 235 ++runLength; 236 } else { 237 encodeRun(buffer, runValue, runLength); 238 runValue = s; 239 runLength = 1; 240 } 241 } 242 encodeRun(buffer, runValue, runLength); 243 return buffer.toString(); 244 } 245 246 /** 247 * Construct a string representing a short array. Use run-length encoding. 248 * A character represents itself, unless it is the ESCAPE character. Then 249 * the following notations are possible: 250 * ESCAPE ESCAPE ESCAPE literal 251 * ESCAPE n c n instances of character c 252 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 253 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 254 * If we encounter a run where n == ESCAPE, we represent this as: 255 * c ESCAPE n-1 c 256 * The ESCAPE value is chosen so as not to collide with commonly 257 * seen values. 258 */ arrayToRLEString(short[] a)259 static public final String arrayToRLEString(short[] a) { 260 StringBuilder buffer = new StringBuilder(); 261 // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]); 262 buffer.append((char) (a.length >> 16)); 263 buffer.append((char) a.length); 264 short runValue = a[0]; 265 int runLength = 1; 266 for (int i=1; i<a.length; ++i) { 267 short s = a[i]; 268 if (s == runValue && runLength < 0xFFFF) ++runLength; 269 else { 270 encodeRun(buffer, runValue, runLength); 271 runValue = s; 272 runLength = 1; 273 } 274 } 275 encodeRun(buffer, runValue, runLength); 276 return buffer.toString(); 277 } 278 279 /** 280 * Construct a string representing a char array. Use run-length encoding. 281 * A character represents itself, unless it is the ESCAPE character. Then 282 * the following notations are possible: 283 * ESCAPE ESCAPE ESCAPE literal 284 * ESCAPE n c n instances of character c 285 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 286 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 287 * If we encounter a run where n == ESCAPE, we represent this as: 288 * c ESCAPE n-1 c 289 * The ESCAPE value is chosen so as not to collide with commonly 290 * seen values. 291 */ arrayToRLEString(char[] a)292 static public final String arrayToRLEString(char[] a) { 293 StringBuilder buffer = new StringBuilder(); 294 buffer.append((char) (a.length >> 16)); 295 buffer.append((char) a.length); 296 char runValue = a[0]; 297 int runLength = 1; 298 for (int i=1; i<a.length; ++i) { 299 char s = a[i]; 300 if (s == runValue && runLength < 0xFFFF) ++runLength; 301 else { 302 encodeRun(buffer, (short)runValue, runLength); 303 runValue = s; 304 runLength = 1; 305 } 306 } 307 encodeRun(buffer, (short)runValue, runLength); 308 return buffer.toString(); 309 } 310 311 /** 312 * Construct a string representing a byte array. Use run-length encoding. 313 * Two bytes are packed into a single char, with a single extra zero byte at 314 * the end if needed. A byte represents itself, unless it is the 315 * ESCAPE_BYTE. Then the following notations are possible: 316 * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal 317 * ESCAPE_BYTE n b n instances of byte b 318 * Since an encoded run occupies 3 bytes, we only encode runs of 4 or 319 * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. 320 * If we encounter a run where n == ESCAPE_BYTE, we represent this as: 321 * b ESCAPE_BYTE n-1 b 322 * The ESCAPE_BYTE value is chosen so as not to collide with commonly 323 * seen values. 324 */ arrayToRLEString(byte[] a)325 static public final String arrayToRLEString(byte[] a) { 326 StringBuilder buffer = new StringBuilder(); 327 buffer.append((char) (a.length >> 16)); 328 buffer.append((char) a.length); 329 byte runValue = a[0]; 330 int runLength = 1; 331 byte[] state = new byte[2]; 332 for (int i=1; i<a.length; ++i) { 333 byte b = a[i]; 334 if (b == runValue && runLength < 0xFF) ++runLength; 335 else { 336 encodeRun(buffer, runValue, runLength, state); 337 runValue = b; 338 runLength = 1; 339 } 340 } 341 encodeRun(buffer, runValue, runLength, state); 342 343 // We must save the final byte, if there is one, by padding 344 // an extra zero. 345 if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state); 346 347 return buffer.toString(); 348 } 349 350 /** 351 * Encode a run, possibly a degenerate run (of < 4 values). 352 * @param length The length of the run; must be > 0 && <= 0xFFFF. 353 */ encodeRun(T buffer, int value, int length)354 private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) { 355 if (length < 4) { 356 for (int j=0; j<length; ++j) { 357 if (value == ESCAPE) { 358 appendInt(buffer, value); 359 } 360 appendInt(buffer, value); 361 } 362 } 363 else { 364 if (length == ESCAPE) { 365 if (value == ESCAPE) { 366 appendInt(buffer, ESCAPE); 367 } 368 appendInt(buffer, value); 369 --length; 370 } 371 appendInt(buffer, ESCAPE); 372 appendInt(buffer, length); 373 appendInt(buffer, value); // Don't need to escape this value 374 } 375 } 376 appendInt(T buffer, int value)377 private static final <T extends Appendable> void appendInt(T buffer, int value) { 378 try { 379 buffer.append((char)(value >>> 16)); 380 buffer.append((char)(value & 0xFFFF)); 381 } catch (IOException e) { 382 throw new IllegalIcuArgumentException(e); 383 } 384 } 385 386 /** 387 * Encode a run, possibly a degenerate run (of < 4 values). 388 * @param length The length of the run; must be > 0 && <= 0xFFFF. 389 */ encodeRun(T buffer, short value, int length)390 private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) { 391 try { 392 char valueChar = (char) value; 393 if (length < 4) { 394 for (int j=0; j<length; ++j) { 395 if (valueChar == ESCAPE) { 396 buffer.append(ESCAPE); 397 } 398 buffer.append(valueChar); 399 } 400 } 401 else { 402 if (length == ESCAPE) { 403 if (valueChar == ESCAPE) { 404 buffer.append(ESCAPE); 405 } 406 buffer.append(valueChar); 407 --length; 408 } 409 buffer.append(ESCAPE); 410 buffer.append((char) length); 411 buffer.append(valueChar); // Don't need to escape this value 412 } 413 } catch (IOException e) { 414 throw new IllegalIcuArgumentException(e); 415 } 416 } 417 418 /** 419 * Encode a run, possibly a degenerate run (of < 4 values). 420 * @param length The length of the run; must be > 0 && <= 0xFF. 421 */ encodeRun(T buffer, byte value, int length, byte[] state)422 private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length, 423 byte[] state) { 424 if (length < 4) { 425 for (int j=0; j<length; ++j) { 426 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 427 appendEncodedByte(buffer, value, state); 428 } 429 } 430 else { 431 if ((byte)length == ESCAPE_BYTE) { 432 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 433 appendEncodedByte(buffer, value, state); 434 --length; 435 } 436 appendEncodedByte(buffer, ESCAPE_BYTE, state); 437 appendEncodedByte(buffer, (byte)length, state); 438 appendEncodedByte(buffer, value, state); // Don't need to escape this value 439 } 440 } 441 442 /** 443 * Append a byte to the given Appendable, packing two bytes into each 444 * character. The state parameter maintains intermediary data between 445 * calls. 446 * @param state A two-element array, with state[0] == 0 if this is the 447 * first byte of a pair, or state[0] != 0 if this is the second byte 448 * of a pair, in which case state[1] is the first byte. 449 */ appendEncodedByte(T buffer, byte value, byte[] state)450 private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value, 451 byte[] state) { 452 try { 453 if (state[0] != 0) { 454 char c = (char) ((state[1] << 8) | ((value) & 0xFF)); 455 buffer.append(c); 456 state[0] = 0; 457 } 458 else { 459 state[0] = 1; 460 state[1] = value; 461 } 462 } catch (IOException e) { 463 throw new IllegalIcuArgumentException(e); 464 } 465 } 466 467 /** 468 * Construct an array of ints from a run-length encoded string. 469 */ RLEStringToIntArray(String s)470 static public final int[] RLEStringToIntArray(String s) { 471 int length = getInt(s, 0); 472 int[] array = new int[length]; 473 int ai = 0, i = 1; 474 475 int maxI = s.length() / 2; 476 while (ai < length && i < maxI) { 477 int c = getInt(s, i++); 478 479 if (c == ESCAPE) { 480 c = getInt(s, i++); 481 if (c == ESCAPE) { 482 array[ai++] = c; 483 } else { 484 int runLength = c; 485 int runValue = getInt(s, i++); 486 for (int j=0; j<runLength; ++j) { 487 array[ai++] = runValue; 488 } 489 } 490 } 491 else { 492 array[ai++] = c; 493 } 494 } 495 496 if (ai != length || i != maxI) { 497 throw new IllegalStateException("Bad run-length encoded int array"); 498 } 499 500 return array; 501 } getInt(String s, int i)502 static final int getInt(String s, int i) { 503 return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1); 504 } 505 506 /** 507 * Construct an array of shorts from a run-length encoded string. 508 */ RLEStringToShortArray(String s)509 static public final short[] RLEStringToShortArray(String s) { 510 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 511 short[] array = new short[length]; 512 int ai = 0; 513 for (int i=2; i<s.length(); ++i) { 514 char c = s.charAt(i); 515 if (c == ESCAPE) { 516 c = s.charAt(++i); 517 if (c == ESCAPE) { 518 array[ai++] = (short) c; 519 } else { 520 int runLength = c; 521 short runValue = (short) s.charAt(++i); 522 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 523 } 524 } 525 else { 526 array[ai++] = (short) c; 527 } 528 } 529 530 if (ai != length) 531 throw new IllegalStateException("Bad run-length encoded short array"); 532 533 return array; 534 } 535 536 /** 537 * Construct an array of shorts from a run-length encoded string. 538 */ RLEStringToCharArray(String s)539 static public final char[] RLEStringToCharArray(String s) { 540 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 541 char[] array = new char[length]; 542 int ai = 0; 543 for (int i=2; i<s.length(); ++i) { 544 char c = s.charAt(i); 545 if (c == ESCAPE) { 546 c = s.charAt(++i); 547 if (c == ESCAPE) { 548 array[ai++] = c; 549 } else { 550 int runLength = c; 551 char runValue = s.charAt(++i); 552 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 553 } 554 } 555 else { 556 array[ai++] = c; 557 } 558 } 559 560 if (ai != length) 561 throw new IllegalStateException("Bad run-length encoded short array"); 562 563 return array; 564 } 565 566 /** 567 * Construct an array of bytes from a run-length encoded string. 568 */ RLEStringToByteArray(String s)569 static public final byte[] RLEStringToByteArray(String s) { 570 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 571 byte[] array = new byte[length]; 572 boolean nextChar = true; 573 char c = 0; 574 int node = 0; 575 int runLength = 0; 576 int i = 2; 577 for (int ai=0; ai<length; ) { 578 // This part of the loop places the next byte into the local 579 // variable 'b' each time through the loop. It keeps the 580 // current character in 'c' and uses the boolean 'nextChar' 581 // to see if we've taken both bytes out of 'c' yet. 582 byte b; 583 if (nextChar) { 584 c = s.charAt(i++); 585 b = (byte) (c >> 8); 586 nextChar = false; 587 } 588 else { 589 b = (byte) (c & 0xFF); 590 nextChar = true; 591 } 592 593 // This part of the loop is a tiny state machine which handles 594 // the parsing of the run-length encoding. This would be simpler 595 // if we could look ahead, but we can't, so we use 'node' to 596 // move between three nodes in the state machine. 597 switch (node) { 598 case 0: 599 // Normal idle node 600 if (b == ESCAPE_BYTE) { 601 node = 1; 602 } 603 else { 604 array[ai++] = b; 605 } 606 break; 607 case 1: 608 // We have seen one ESCAPE_BYTE; we expect either a second 609 // one, or a run length and value. 610 if (b == ESCAPE_BYTE) { 611 array[ai++] = ESCAPE_BYTE; 612 node = 0; 613 } 614 else { 615 runLength = b; 616 // Interpret signed byte as unsigned 617 if (runLength < 0) runLength += 0x100; 618 node = 2; 619 } 620 break; 621 case 2: 622 // We have seen an ESCAPE_BYTE and length byte. We interpret 623 // the next byte as the value to be repeated. 624 for (int j=0; j<runLength; ++j) array[ai++] = b; 625 node = 0; 626 break; 627 } 628 } 629 630 if (node != 0) 631 throw new IllegalStateException("Bad run-length encoded byte array"); 632 633 if (i != s.length()) 634 throw new IllegalStateException("Excess data in RLE byte array string"); 635 636 return array; 637 } 638 639 static public String LINE_SEPARATOR = System.getProperty("line.separator"); 640 641 /** 642 * Format a String for representation in a source file. This includes 643 * breaking it into lines and escaping characters using octal notation 644 * when necessary (control characters and double quotes). 645 */ formatForSource(String s)646 static public final String formatForSource(String s) { 647 StringBuilder buffer = new StringBuilder(); 648 for (int i=0; i<s.length();) { 649 if (i > 0) buffer.append('+').append(LINE_SEPARATOR); 650 buffer.append(" \""); 651 int count = 11; 652 while (i<s.length() && count<80) { 653 char c = s.charAt(i++); 654 if (c < '\u0020' || c == '"' || c == '\\') { 655 if (c == '\n') { 656 buffer.append("\\n"); 657 count += 2; 658 } else if (c == '\t') { 659 buffer.append("\\t"); 660 count += 2; 661 } else if (c == '\r') { 662 buffer.append("\\r"); 663 count += 2; 664 } else { 665 // Represent control characters, backslash and double quote 666 // using octal notation; otherwise the string we form 667 // won't compile, since Unicode escape sequences are 668 // processed before tokenization. 669 buffer.append('\\'); 670 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 671 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 672 buffer.append(HEX_DIGIT[(c & 0007)]); 673 count += 4; 674 } 675 } 676 else if (c <= '\u007E') { 677 buffer.append(c); 678 count += 1; 679 } 680 else { 681 buffer.append("\\u"); 682 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 683 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 684 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 685 buffer.append(HEX_DIGIT[(c & 0x000F)]); 686 count += 6; 687 } 688 } 689 buffer.append('"'); 690 } 691 return buffer.toString(); 692 } 693 694 static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7', 695 '8','9','A','B','C','D','E','F'}; 696 697 /** 698 * Format a String for representation in a source file. Like 699 * formatForSource but does not do line breaking. 700 */ format1ForSource(String s)701 static public final String format1ForSource(String s) { 702 StringBuilder buffer = new StringBuilder(); 703 buffer.append("\""); 704 for (int i=0; i<s.length();) { 705 char c = s.charAt(i++); 706 if (c < '\u0020' || c == '"' || c == '\\') { 707 if (c == '\n') { 708 buffer.append("\\n"); 709 } else if (c == '\t') { 710 buffer.append("\\t"); 711 } else if (c == '\r') { 712 buffer.append("\\r"); 713 } else { 714 // Represent control characters, backslash and double quote 715 // using octal notation; otherwise the string we form 716 // won't compile, since Unicode escape sequences are 717 // processed before tokenization. 718 buffer.append('\\'); 719 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 720 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 721 buffer.append(HEX_DIGIT[(c & 0007)]); 722 } 723 } 724 else if (c <= '\u007E') { 725 buffer.append(c); 726 } 727 else { 728 buffer.append("\\u"); 729 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 730 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 731 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 732 buffer.append(HEX_DIGIT[(c & 0x000F)]); 733 } 734 } 735 buffer.append('"'); 736 return buffer.toString(); 737 } 738 739 /** 740 * Convert characters outside the range U+0020 to U+007F to 741 * Unicode escapes, and convert backslash to a double backslash. 742 */ escape(String s)743 public static final String escape(String s) { 744 StringBuilder buf = new StringBuilder(); 745 for (int i=0; i<s.length(); ) { 746 int c = Character.codePointAt(s, i); 747 i += UTF16.getCharCount(c); 748 if (c >= ' ' && c <= 0x007F) { 749 if (c == '\\') { 750 buf.append("\\\\"); // That is, "\\" 751 } else { 752 buf.append((char)c); 753 } 754 } else { 755 boolean four = c <= 0xFFFF; 756 buf.append(four ? "\\u" : "\\U"); 757 buf.append(hex(c, four ? 4 : 8)); 758 } 759 } 760 return buf.toString(); 761 } 762 763 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 764 static private final char[] UNESCAPE_MAP = { 765 /*" 0x22, 0x22 */ 766 /*' 0x27, 0x27 */ 767 /*? 0x3F, 0x3F */ 768 /*\ 0x5C, 0x5C */ 769 /*a*/ 0x61, 0x07, 770 /*b*/ 0x62, 0x08, 771 /*e*/ 0x65, 0x1b, 772 /*f*/ 0x66, 0x0c, 773 /*n*/ 0x6E, 0x0a, 774 /*r*/ 0x72, 0x0d, 775 /*t*/ 0x74, 0x09, 776 /*v*/ 0x76, 0x0b 777 }; 778 779 /** 780 * Convert an escape to a 32-bit code point value. We attempt 781 * to parallel the icu4c unescapeAt() function. 782 * @param offset16 an array containing offset to the character 783 * <em>after</em> the backslash. Upon return offset16[0] will 784 * be updated to point after the escape sequence. 785 * @return character value from 0 to 10FFFF, or -1 on error. 786 */ unescapeAt(String s, int[] offset16)787 public static int unescapeAt(String s, int[] offset16) { 788 int c; 789 int result = 0; 790 int n = 0; 791 int minDig = 0; 792 int maxDig = 0; 793 int bitsPerDigit = 4; 794 int dig; 795 int i; 796 boolean braces = false; 797 798 /* Check that offset is in range */ 799 int offset = offset16[0]; 800 int length = s.length(); 801 if (offset < 0 || offset >= length) { 802 return -1; 803 } 804 805 /* Fetch first UChar after '\\' */ 806 c = Character.codePointAt(s, offset); 807 offset += UTF16.getCharCount(c); 808 809 /* Convert hexadecimal and octal escapes */ 810 switch (c) { 811 case 'u': 812 minDig = maxDig = 4; 813 break; 814 case 'U': 815 minDig = maxDig = 8; 816 break; 817 case 'x': 818 minDig = 1; 819 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { 820 ++offset; 821 braces = true; 822 maxDig = 8; 823 } else { 824 maxDig = 2; 825 } 826 break; 827 default: 828 dig = UCharacter.digit(c, 8); 829 if (dig >= 0) { 830 minDig = 1; 831 maxDig = 3; 832 n = 1; /* Already have first octal digit */ 833 bitsPerDigit = 3; 834 result = dig; 835 } 836 break; 837 } 838 if (minDig != 0) { 839 while (offset < length && n < maxDig) { 840 c = UTF16.charAt(s, offset); 841 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); 842 if (dig < 0) { 843 break; 844 } 845 result = (result << bitsPerDigit) | dig; 846 offset += UTF16.getCharCount(c); 847 ++n; 848 } 849 if (n < minDig) { 850 return -1; 851 } 852 if (braces) { 853 if (c != 0x7D /*}*/) { 854 return -1; 855 } 856 ++offset; 857 } 858 if (result < 0 || result >= 0x110000) { 859 return -1; 860 } 861 // If an escape sequence specifies a lead surrogate, see 862 // if there is a trail surrogate after it, either as an 863 // escape or as a literal. If so, join them up into a 864 // supplementary. 865 if (offset < length && 866 UTF16.isLeadSurrogate((char) result)) { 867 int ahead = offset+1; 868 c = s.charAt(offset); // [sic] get 16-bit code unit 869 if (c == '\\' && ahead < length) { 870 int o[] = new int[] { ahead }; 871 c = unescapeAt(s, o); 872 ahead = o[0]; 873 } 874 if (UTF16.isTrailSurrogate((char) c)) { 875 offset = ahead; 876 result = Character.toCodePoint((char) result, (char) c); 877 } 878 } 879 offset16[0] = offset; 880 return result; 881 } 882 883 /* Convert C-style escapes in table */ 884 for (i=0; i<UNESCAPE_MAP.length; i+=2) { 885 if (c == UNESCAPE_MAP[i]) { 886 offset16[0] = offset; 887 return UNESCAPE_MAP[i+1]; 888 } else if (c < UNESCAPE_MAP[i]) { 889 break; 890 } 891 } 892 893 /* Map \cX to control-X: X & 0x1F */ 894 if (c == 'c' && offset < length) { 895 c = UTF16.charAt(s, offset); 896 offset16[0] = offset + UTF16.getCharCount(c); 897 return 0x1F & c; 898 } 899 900 /* If no special forms are recognized, then consider 901 * the backslash to generically escape the next character. */ 902 offset16[0] = offset; 903 return c; 904 } 905 906 /** 907 * Convert all escapes in a given string using unescapeAt(). 908 * @exception IllegalArgumentException if an invalid escape is 909 * seen. 910 */ unescape(String s)911 public static String unescape(String s) { 912 StringBuilder buf = new StringBuilder(); 913 int[] pos = new int[1]; 914 for (int i=0; i<s.length(); ) { 915 char c = s.charAt(i++); 916 if (c == '\\') { 917 pos[0] = i; 918 int e = unescapeAt(s, pos); 919 if (e < 0) { 920 throw new IllegalArgumentException("Invalid escape sequence " + 921 s.substring(i-1, Math.min(i+8, s.length()))); 922 } 923 buf.appendCodePoint(e); 924 i = pos[0]; 925 } else { 926 buf.append(c); 927 } 928 } 929 return buf.toString(); 930 } 931 932 /** 933 * Convert all escapes in a given string using unescapeAt(). 934 * Leave invalid escape sequences unchanged. 935 */ unescapeLeniently(String s)936 public static String unescapeLeniently(String s) { 937 StringBuilder buf = new StringBuilder(); 938 int[] pos = new int[1]; 939 for (int i=0; i<s.length(); ) { 940 char c = s.charAt(i++); 941 if (c == '\\') { 942 pos[0] = i; 943 int e = unescapeAt(s, pos); 944 if (e < 0) { 945 buf.append(c); 946 } else { 947 buf.appendCodePoint(e); 948 i = pos[0]; 949 } 950 } else { 951 buf.append(c); 952 } 953 } 954 return buf.toString(); 955 } 956 957 /** 958 * Convert a char to 4 hex uppercase digits. E.g., hex('a') => 959 * "0041". 960 */ hex(long ch)961 public static String hex(long ch) { 962 return hex(ch, 4); 963 } 964 965 /** 966 * Supplies a zero-padded hex representation of an integer (without 0x) 967 */ hex(long i, int places)968 static public String hex(long i, int places) { 969 if (i == Long.MIN_VALUE) return "-8000000000000000"; 970 boolean negative = i < 0; 971 if (negative) { 972 i = -i; 973 } 974 String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); 975 if (result.length() < places) { 976 result = "0000000000000000".substring(result.length(),places) + result; 977 } 978 if (negative) { 979 return '-' + result; 980 } 981 return result; 982 } 983 984 /** 985 * Convert a string to comma-separated groups of 4 hex uppercase 986 * digits. E.g., hex('ab') => "0041,0042". 987 */ 988 public static String hex(CharSequence s) { 989 return hex(s, 4, ",", true, new StringBuilder()).toString(); 990 } 991 992 /** 993 * Convert a string to separated groups of hex uppercase 994 * digits. E.g., hex('ab'...) => "0041,0042". Append the output 995 * to the given Appendable. 996 */ 997 public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) { 998 try { 999 if (useCodePoints) { 1000 int cp; 1001 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1002 cp = Character.codePointAt(s, i); 1003 if (i != 0) { 1004 result.append(separator); 1005 } 1006 result.append(hex(cp,width)); 1007 } 1008 } else { 1009 for (int i = 0; i < s.length(); ++i) { 1010 if (i != 0) { 1011 result.append(separator); 1012 } 1013 result.append(hex(s.charAt(i),width)); 1014 } 1015 } 1016 return result; 1017 } catch (IOException e) { 1018 throw new IllegalIcuArgumentException(e); 1019 } 1020 } 1021 1022 public static String hex(byte[] o, int start, int end, String separator) { 1023 StringBuilder result = new StringBuilder(); 1024 //int ch; 1025 for (int i = start; i < end; ++i) { 1026 if (i != 0) result.append(separator); 1027 result.append(hex(o[i])); 1028 } 1029 return result.toString(); 1030 } 1031 1032 /** 1033 * Convert a string to comma-separated groups of 4 hex uppercase 1034 * digits. E.g., hex('ab') => "0041,0042". 1035 */ 1036 public static <S extends CharSequence> String hex(S s, int width, S separator) { 1037 return hex(s, width, separator, true, new StringBuilder()).toString(); 1038 } 1039 1040 /** 1041 * Split a string into pieces based on the given divider character 1042 * @param s the string to split 1043 * @param divider the character on which to split. Occurrences of 1044 * this character are not included in the output 1045 * @param output an array to receive the substrings between 1046 * instances of divider. It must be large enough on entry to 1047 * accomodate all output. Adjacent instances of the divider 1048 * character will place empty strings into output. Before 1049 * returning, output is padded out with empty strings. 1050 */ 1051 public static void split(String s, char divider, String[] output) { 1052 int last = 0; 1053 int current = 0; 1054 int i; 1055 for (i = 0; i < s.length(); ++i) { 1056 if (s.charAt(i) == divider) { 1057 output[current++] = s.substring(last,i); 1058 last = i+1; 1059 } 1060 } 1061 output[current++] = s.substring(last,i); 1062 while (current < output.length) { 1063 output[current++] = ""; 1064 } 1065 } 1066 1067 /** 1068 * Split a string into pieces based on the given divider character 1069 * @param s the string to split 1070 * @param divider the character on which to split. Occurrences of 1071 * this character are not included in the output 1072 * @return output an array to receive the substrings between 1073 * instances of divider. Adjacent instances of the divider 1074 * character will place empty strings into output. 1075 */ 1076 public static String[] split(String s, char divider) { 1077 int last = 0; 1078 int i; 1079 ArrayList<String> output = new ArrayList<>(); 1080 for (i = 0; i < s.length(); ++i) { 1081 if (s.charAt(i) == divider) { 1082 output.add(s.substring(last,i)); 1083 last = i+1; 1084 } 1085 } 1086 output.add( s.substring(last,i)); 1087 return output.toArray(new String[output.size()]); 1088 } 1089 1090 /** 1091 * Look up a given string in a string array. Returns the index at 1092 * which the first occurrence of the string was found in the 1093 * array, or -1 if it was not found. 1094 * @param source the string to search for 1095 * @param target the array of zero or more strings in which to 1096 * look for source 1097 * @return the index of target at which source first occurs, or -1 1098 * if not found 1099 */ 1100 public static int lookup(String source, String[] target) { 1101 for (int i = 0; i < target.length; ++i) { 1102 if (source.equals(target[i])) return i; 1103 } 1104 return -1; 1105 } 1106 1107 /** 1108 * Parse a single non-whitespace character 'ch', optionally 1109 * preceded by whitespace. 1110 * @param id the string to be parsed 1111 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 1112 * offset of the first character to be parsed. On output, pos[0] 1113 * is the index after the last parsed character. If the parse 1114 * fails, pos[0] will be unchanged. 1115 * @param ch the non-whitespace character to be parsed. 1116 * @return true if 'ch' is seen preceded by zero or more 1117 * whitespace characters. 1118 */ 1119 public static boolean parseChar(String id, int[] pos, char ch) { 1120 int start = pos[0]; 1121 pos[0] = PatternProps.skipWhiteSpace(id, pos[0]); 1122 if (pos[0] == id.length() || 1123 id.charAt(pos[0]) != ch) { 1124 pos[0] = start; 1125 return false; 1126 } 1127 ++pos[0]; 1128 return true; 1129 } 1130 1131 /** 1132 * Parse a pattern string starting at offset pos. Keywords are 1133 * matched case-insensitively. Spaces may be skipped and may be 1134 * optional or required. Integer values may be parsed, and if 1135 * they are, they will be returned in the given array. If 1136 * successful, the offset of the next non-space character is 1137 * returned. On failure, -1 is returned. 1138 * @param pattern must only contain lowercase characters, which 1139 * will match their uppercase equivalents as well. A space 1140 * character matches one or more required spaces. A '~' character 1141 * matches zero or more optional spaces. A '#' character matches 1142 * an integer and stores it in parsedInts, which the caller must 1143 * ensure has enough capacity. 1144 * @param parsedInts array to receive parsed integers. Caller 1145 * must ensure that parsedInts.length is >= the number of '#' 1146 * signs in 'pattern'. 1147 * @return the position after the last character parsed, or -1 if 1148 * the parse failed 1149 */ 1150 @SuppressWarnings("fallthrough") 1151 public static int parsePattern(String rule, int pos, int limit, 1152 String pattern, int[] parsedInts) { 1153 // TODO Update this to handle surrogates 1154 int[] p = new int[1]; 1155 int intCount = 0; // number of integers parsed 1156 for (int i=0; i<pattern.length(); ++i) { 1157 char cpat = pattern.charAt(i); 1158 char c; 1159 switch (cpat) { 1160 case ' ': 1161 if (pos >= limit) { 1162 return -1; 1163 } 1164 c = rule.charAt(pos++); 1165 if (!PatternProps.isWhiteSpace(c)) { 1166 return -1; 1167 } 1168 // FALL THROUGH to skipWhitespace 1169 case '~': 1170 pos = PatternProps.skipWhiteSpace(rule, pos); 1171 break; 1172 case '#': 1173 p[0] = pos; 1174 parsedInts[intCount++] = parseInteger(rule, p, limit); 1175 if (p[0] == pos) { 1176 // Syntax error; failed to parse integer 1177 return -1; 1178 } 1179 pos = p[0]; 1180 break; 1181 default: 1182 if (pos >= limit) { 1183 return -1; 1184 } 1185 c = (char) UCharacter.toLowerCase(rule.charAt(pos++)); 1186 if (c != cpat) { 1187 return -1; 1188 } 1189 break; 1190 } 1191 } 1192 return pos; 1193 } 1194 1195 /** 1196 * Parse a pattern string within the given Replaceable and a parsing 1197 * pattern. Characters are matched literally and case-sensitively 1198 * except for the following special characters: 1199 * 1200 * ~ zero or more Pattern_White_Space chars 1201 * 1202 * If end of pattern is reached with all matches along the way, 1203 * pos is advanced to the first unparsed index and returned. 1204 * Otherwise -1 is returned. 1205 * @param pat pattern that controls parsing 1206 * @param text text to be parsed, starting at index 1207 * @param index offset to first character to parse 1208 * @param limit offset after last character to parse 1209 * @return index after last parsed character, or -1 on parse failure. 1210 */ 1211 public static int parsePattern(String pat, 1212 Replaceable text, 1213 int index, 1214 int limit) { 1215 int ipat = 0; 1216 1217 // empty pattern matches immediately 1218 if (ipat == pat.length()) { 1219 return index; 1220 } 1221 1222 int cpat = Character.codePointAt(pat, ipat); 1223 1224 while (index < limit) { 1225 int c = text.char32At(index); 1226 1227 // parse \s* 1228 if (cpat == '~') { 1229 if (PatternProps.isWhiteSpace(c)) { 1230 index += UTF16.getCharCount(c); 1231 continue; 1232 } else { 1233 if (++ipat == pat.length()) { 1234 return index; // success; c unparsed 1235 } 1236 // fall thru; process c again with next cpat 1237 } 1238 } 1239 1240 // parse literal 1241 else if (c == cpat) { 1242 int n = UTF16.getCharCount(c); 1243 index += n; 1244 ipat += n; 1245 if (ipat == pat.length()) { 1246 return index; // success; c parsed 1247 } 1248 // fall thru; get next cpat 1249 } 1250 1251 // match failure of literal 1252 else { 1253 return -1; 1254 } 1255 1256 cpat = UTF16.charAt(pat, ipat); 1257 } 1258 1259 return -1; // text ended before end of pat 1260 } 1261 1262 /** 1263 * Parse an integer at pos, either of the form \d+ or of the form 1264 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex, 1265 * or octal format. 1266 * @param pos INPUT-OUTPUT parameter. On input, the first 1267 * character to parse. On output, the character after the last 1268 * parsed character. 1269 */ 1270 public static int parseInteger(String rule, int[] pos, int limit) { 1271 int count = 0; 1272 int value = 0; 1273 int p = pos[0]; 1274 int radix = 10; 1275 1276 if (rule.regionMatches(true, p, "0x", 0, 2)) { 1277 p += 2; 1278 radix = 16; 1279 } else if (p < limit && rule.charAt(p) == '0') { 1280 p++; 1281 count = 1; 1282 radix = 8; 1283 } 1284 1285 while (p < limit) { 1286 int d = UCharacter.digit(rule.charAt(p++), radix); 1287 if (d < 0) { 1288 --p; 1289 break; 1290 } 1291 ++count; 1292 int v = (value * radix) + d; 1293 if (v <= value) { 1294 // If there are too many input digits, at some point 1295 // the value will go negative, e.g., if we have seen 1296 // "0x8000000" already and there is another '0', when 1297 // we parse the next 0 the value will go negative. 1298 return 0; 1299 } 1300 value = v; 1301 } 1302 if (count > 0) { 1303 pos[0] = p; 1304 } 1305 return value; 1306 } 1307 1308 /** 1309 * Parse a Unicode identifier from the given string at the given 1310 * position. Return the identifier, or null if there is no 1311 * identifier. 1312 * @param str the string to parse 1313 * @param pos INPUT-OUPUT parameter. On INPUT, pos[0] is the 1314 * first character to examine. It must be less than str.length(), 1315 * and it must not point to a whitespace character. That is, must 1316 * have pos[0] < str.length(). On 1317 * OUTPUT, the position after the last parsed character. 1318 * @return the Unicode identifier, or null if there is no valid 1319 * identifier at pos[0]. 1320 */ 1321 public static String parseUnicodeIdentifier(String str, int[] pos) { 1322 // assert(pos[0] < str.length()); 1323 StringBuilder buf = new StringBuilder(); 1324 int p = pos[0]; 1325 while (p < str.length()) { 1326 int ch = Character.codePointAt(str, p); 1327 if (buf.length() == 0) { 1328 if (UCharacter.isUnicodeIdentifierStart(ch)) { 1329 buf.appendCodePoint(ch); 1330 } else { 1331 return null; 1332 } 1333 } else { 1334 if (UCharacter.isUnicodeIdentifierPart(ch)) { 1335 buf.appendCodePoint(ch); 1336 } else { 1337 break; 1338 } 1339 } 1340 p += UTF16.getCharCount(ch); 1341 } 1342 pos[0] = p; 1343 return buf.toString(); 1344 } 1345 1346 static final char DIGITS[] = { 1347 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 1348 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 1349 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 1350 'U', 'V', 'W', 'X', 'Y', 'Z' 1351 }; 1352 1353 /** 1354 * Append the digits of a positive integer to the given 1355 * <code>Appendable</code> in the given radix. This is 1356 * done recursively since it is easiest to generate the low- 1357 * order digit first, but it must be appended last. 1358 * 1359 * @param result is the <code>Appendable</code> to append to 1360 * @param n is the positive integer 1361 * @param radix is the radix, from 2 to 36 inclusive 1362 * @param minDigits is the minimum number of digits to append. 1363 */ 1364 private static <T extends Appendable> void recursiveAppendNumber(T result, int n, 1365 int radix, int minDigits) 1366 { 1367 try { 1368 int digit = n % radix; 1369 1370 if (n >= radix || minDigits > 1) { 1371 recursiveAppendNumber(result, n / radix, radix, minDigits - 1); 1372 } 1373 result.append(DIGITS[digit]); 1374 } catch (IOException e) { 1375 throw new IllegalIcuArgumentException(e); 1376 } 1377 } 1378 1379 /** 1380 * Append a number to the given Appendable in the given radix. 1381 * Standard digits '0'-'9' are used and letters 'A'-'Z' for 1382 * radices 11 through 36. 1383 * @param result the digits of the number are appended here 1384 * @param n the number to be converted to digits; may be negative. 1385 * If negative, a '-' is prepended to the digits. 1386 * @param radix a radix from 2 to 36 inclusive. 1387 * @param minDigits the minimum number of digits, not including 1388 * any '-', to produce. Values less than 2 have no effect. One 1389 * digit is always emitted regardless of this parameter. 1390 * @return a reference to result 1391 */ 1392 public static <T extends Appendable> T appendNumber(T result, int n, 1393 int radix, int minDigits) 1394 { 1395 try { 1396 if (radix < 2 || radix > 36) { 1397 throw new IllegalArgumentException("Illegal radix " + radix); 1398 } 1399 1400 1401 int abs = n; 1402 1403 if (n < 0) { 1404 abs = -n; 1405 result.append("-"); 1406 } 1407 1408 recursiveAppendNumber(result, abs, radix, minDigits); 1409 1410 return result; 1411 } catch (IOException e) { 1412 throw new IllegalIcuArgumentException(e); 1413 } 1414 1415 } 1416 1417 /** 1418 * Parse an unsigned 31-bit integer at the given offset. Use 1419 * UCharacter.digit() to parse individual characters into digits. 1420 * @param text the text to be parsed 1421 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the 1422 * offset within text at which to start parsing; it should point 1423 * to a valid digit. On exit, pos[0] is the offset after the last 1424 * parsed character. If the parse failed, it will be unchanged on 1425 * exit. Must be >= 0 on entry. 1426 * @param radix the radix in which to parse; must be >= 2 and <= 1427 * 36. 1428 * @return a non-negative parsed number, or -1 upon parse failure. 1429 * Parse fails if there are no digits, that is, if pos[0] does not 1430 * point to a valid digit on entry, or if the number to be parsed 1431 * does not fit into a 31-bit unsigned integer. 1432 */ 1433 public static int parseNumber(String text, int[] pos, int radix) { 1434 // assert(pos[0] >= 0); 1435 // assert(radix >= 2); 1436 // assert(radix <= 36); 1437 int n = 0; 1438 int p = pos[0]; 1439 while (p < text.length()) { 1440 int ch = Character.codePointAt(text, p); 1441 int d = UCharacter.digit(ch, radix); 1442 if (d < 0) { 1443 break; 1444 } 1445 n = radix*n + d; 1446 // ASSUME that when a 32-bit integer overflows it becomes 1447 // negative. E.g., 214748364 * 10 + 8 => negative value. 1448 if (n < 0) { 1449 return -1; 1450 } 1451 ++p; 1452 } 1453 if (p == pos[0]) { 1454 return -1; 1455 } 1456 pos[0] = p; 1457 return n; 1458 } 1459 1460 /** 1461 * Return true if the character is NOT printable ASCII. The tab, 1462 * newline and linefeed characters are considered unprintable. 1463 */ 1464 public static boolean isUnprintable(int c) { 1465 //0x20 = 32 and 0x7E = 126 1466 return !(c >= 0x20 && c <= 0x7E); 1467 } 1468 1469 /** 1470 * Escape unprintable characters using <backslash>uxxxx notation 1471 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and 1472 * above. If the character is printable ASCII, then do nothing 1473 * and return FALSE. Otherwise, append the escaped notation and 1474 * return TRUE. 1475 */ 1476 public static <T extends Appendable> boolean escapeUnprintable(T result, int c) { 1477 try { 1478 if (isUnprintable(c)) { 1479 result.append('\\'); 1480 if ((c & ~0xFFFF) != 0) { 1481 result.append('U'); 1482 result.append(DIGITS[0xF&(c>>28)]); 1483 result.append(DIGITS[0xF&(c>>24)]); 1484 result.append(DIGITS[0xF&(c>>20)]); 1485 result.append(DIGITS[0xF&(c>>16)]); 1486 } else { 1487 result.append('u'); 1488 } 1489 result.append(DIGITS[0xF&(c>>12)]); 1490 result.append(DIGITS[0xF&(c>>8)]); 1491 result.append(DIGITS[0xF&(c>>4)]); 1492 result.append(DIGITS[0xF&c]); 1493 return true; 1494 } 1495 return false; 1496 } catch (IOException e) { 1497 throw new IllegalIcuArgumentException(e); 1498 } 1499 } 1500 1501 /** 1502 * Returns the index of the first character in a set, ignoring quoted text. 1503 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 1504 * found by a search for "h". Unlike String.indexOf(), this method searches 1505 * not for a single character, but for any character of the string 1506 * <code>setOfChars</code>. 1507 * @param text text to be searched 1508 * @param start the beginning index, inclusive; <code>0 <= start 1509 * <= limit</code>. 1510 * @param limit the ending index, exclusive; <code>start <= limit 1511 * <= text.length()</code>. 1512 * @param setOfChars string with one or more distinct characters 1513 * @return Offset of the first character in <code>setOfChars</code> 1514 * found, or -1 if not found. 1515 * @see String#indexOf 1516 */ 1517 public static int quotedIndexOf(String text, int start, int limit, 1518 String setOfChars) { 1519 for (int i=start; i<limit; ++i) { 1520 char c = text.charAt(i); 1521 if (c == BACKSLASH) { 1522 ++i; 1523 } else if (c == APOSTROPHE) { 1524 while (++i < limit 1525 && text.charAt(i) != APOSTROPHE) {} 1526 } else if (setOfChars.indexOf(c) >= 0) { 1527 return i; 1528 } 1529 } 1530 return -1; 1531 } 1532 1533 /** 1534 * Append a character to a rule that is being built up. To flush 1535 * the quoteBuf to rule, make one final call with isLiteral == true. 1536 * If there is no final character, pass in (int)-1 as c. 1537 * @param rule the string to append the character to 1538 * @param c the character to append, or (int)-1 if none. 1539 * @param isLiteral if true, then the given character should not be 1540 * quoted or escaped. Usually this means it is a syntactic element 1541 * such as > or $ 1542 * @param escapeUnprintable if true, then unprintable characters 1543 * should be escaped using escapeUnprintable(). These escapes will 1544 * appear outside of quotes. 1545 * @param quoteBuf a buffer which is used to build up quoted 1546 * substrings. The caller should initially supply an empty buffer, 1547 * and thereafter should not modify the buffer. The buffer should be 1548 * cleared out by, at the end, calling this method with a literal 1549 * character (which may be -1). 1550 */ 1551 public static void appendToRule(StringBuffer rule, 1552 int c, 1553 boolean isLiteral, 1554 boolean escapeUnprintable, 1555 StringBuffer quoteBuf) { 1556 // If we are escaping unprintables, then escape them outside 1557 // quotes. \\u and \\U are not recognized within quotes. The same 1558 // logic applies to literals, but literals are never escaped. 1559 if (isLiteral || 1560 (escapeUnprintable && Utility.isUnprintable(c))) { 1561 if (quoteBuf.length() > 0) { 1562 // We prefer backslash APOSTROPHE to double APOSTROPHE 1563 // (more readable, less similar to ") so if there are 1564 // double APOSTROPHEs at the ends, we pull them outside 1565 // of the quote. 1566 1567 // If the first thing in the quoteBuf is APOSTROPHE 1568 // (doubled) then pull it out. 1569 while (quoteBuf.length() >= 2 && 1570 quoteBuf.charAt(0) == APOSTROPHE && 1571 quoteBuf.charAt(1) == APOSTROPHE) { 1572 rule.append(BACKSLASH).append(APOSTROPHE); 1573 quoteBuf.delete(0, 2); 1574 } 1575 // If the last thing in the quoteBuf is APOSTROPHE 1576 // (doubled) then remove and count it and add it after. 1577 int trailingCount = 0; 1578 while (quoteBuf.length() >= 2 && 1579 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE && 1580 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) { 1581 quoteBuf.setLength(quoteBuf.length()-2); 1582 ++trailingCount; 1583 } 1584 if (quoteBuf.length() > 0) { 1585 rule.append(APOSTROPHE); 1586 rule.append(quoteBuf); 1587 rule.append(APOSTROPHE); 1588 quoteBuf.setLength(0); 1589 } 1590 while (trailingCount-- > 0) { 1591 rule.append(BACKSLASH).append(APOSTROPHE); 1592 } 1593 } 1594 if (c != -1) { 1595 /* Since spaces are ignored during parsing, they are 1596 * emitted only for readability. We emit one here 1597 * only if there isn't already one at the end of the 1598 * rule. 1599 */ 1600 if (c == ' ') { 1601 int len = rule.length(); 1602 if (len > 0 && rule.charAt(len-1) != ' ') { 1603 rule.append(' '); 1604 } 1605 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) { 1606 rule.appendCodePoint(c); 1607 } 1608 } 1609 } 1610 1611 // Escape ' and '\' and don't begin a quote just for them 1612 else if (quoteBuf.length() == 0 && 1613 (c == APOSTROPHE || c == BACKSLASH)) { 1614 rule.append(BACKSLASH).append((char)c); 1615 } 1616 1617 // Specials (printable ascii that isn't [0-9a-zA-Z]) and 1618 // whitespace need quoting. Also append stuff to quotes if we are 1619 // building up a quoted substring already. 1620 else if (quoteBuf.length() > 0 || 1621 (c >= 0x0021 && c <= 0x007E && 1622 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 1623 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 1624 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) || 1625 PatternProps.isWhiteSpace(c)) { 1626 quoteBuf.appendCodePoint(c); 1627 // Double ' within a quote 1628 if (c == APOSTROPHE) { 1629 quoteBuf.append((char)c); 1630 } 1631 } 1632 1633 // Otherwise just append 1634 else { 1635 rule.appendCodePoint(c); 1636 } 1637 } 1638 1639 /** 1640 * Append the given string to the rule. Calls the single-character 1641 * version of appendToRule for each character. 1642 */ 1643 public static void appendToRule(StringBuffer rule, 1644 String text, 1645 boolean isLiteral, 1646 boolean escapeUnprintable, 1647 StringBuffer quoteBuf) { 1648 for (int i=0; i<text.length(); ++i) { 1649 // Okay to process in 16-bit code units here 1650 appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf); 1651 } 1652 } 1653 1654 /** 1655 * Given a matcher reference, which may be null, append its 1656 * pattern as a literal to the given rule. 1657 */ 1658 public static void appendToRule(StringBuffer rule, 1659 UnicodeMatcher matcher, 1660 boolean escapeUnprintable, 1661 StringBuffer quoteBuf) { 1662 if (matcher != null) { 1663 appendToRule(rule, matcher.toPattern(escapeUnprintable), 1664 true, escapeUnprintable, quoteBuf); 1665 } 1666 } 1667 1668 /** 1669 * Compares 2 unsigned integers 1670 * @param source 32 bit unsigned integer 1671 * @param target 32 bit unsigned integer 1672 * @return 0 if equals, 1 if source is greater than target and -1 1673 * otherwise 1674 */ 1675 public static final int compareUnsigned(int source, int target) 1676 { 1677 source += MAGIC_UNSIGNED; 1678 target += MAGIC_UNSIGNED; 1679 if (source < target) { 1680 return -1; 1681 } 1682 else if (source > target) { 1683 return 1; 1684 } 1685 return 0; 1686 } 1687 1688 /** 1689 * Find the highest bit in a positive integer. This is done 1690 * by doing a binary search through the bits. 1691 * 1692 * @param n is the integer 1693 * 1694 * @return the bit number of the highest bit, with 0 being 1695 * the low order bit, or -1 if <code>n</code> is not positive 1696 */ 1697 public static final byte highBit(int n) 1698 { 1699 if (n <= 0) { 1700 return -1; 1701 } 1702 1703 byte bit = 0; 1704 1705 if (n >= 1 << 16) { 1706 n >>= 16; 1707 bit += 16; 1708 } 1709 1710 if (n >= 1 << 8) { 1711 n >>= 8; 1712 bit += 8; 1713 } 1714 1715 if (n >= 1 << 4) { 1716 n >>= 4; 1717 bit += 4; 1718 } 1719 1720 if (n >= 1 << 2) { 1721 n >>= 2; 1722 bit += 2; 1723 } 1724 1725 if (n >= 1 << 1) { 1726 n >>= 1; 1727 bit += 1; 1728 } 1729 1730 return bit; 1731 } 1732 /** 1733 * Utility method to take a int[] containing codepoints and return 1734 * a string representation with code units. 1735 */ 1736 public static String valueOf(int[]source){ 1737 // TODO: Investigate why this method is not on UTF16 class 1738 StringBuilder result = new StringBuilder(source.length); 1739 for(int i=0; i<source.length; i++){ 1740 result.appendCodePoint(source[i]); 1741 } 1742 return result.toString(); 1743 } 1744 1745 1746 /** 1747 * Utility to duplicate a string count times 1748 * @param s String to be duplicated. 1749 * @param count Number of times to duplicate a string. 1750 */ 1751 public static String repeat(String s, int count) { 1752 if (count <= 0) return ""; 1753 if (count == 1) return s; 1754 StringBuilder result = new StringBuilder(); 1755 for (int i = 0; i < count; ++i) { 1756 result.append(s); 1757 } 1758 return result.toString(); 1759 } 1760 1761 public static String[] splitString(String src, String target) { 1762 return src.split("\\Q" + target + "\\E"); 1763 } 1764 1765 /** 1766 * Split the string at runs of ascii whitespace characters. 1767 */ 1768 public static String[] splitWhitespace(String src) { 1769 return src.split("\\s+"); 1770 } 1771 1772 /** 1773 * Parse a list of hex numbers and return a string 1774 * @param string String of hex numbers. 1775 * @param minLength Minimal length. 1776 * @param separator Separator. 1777 * @return A string from hex numbers. 1778 */ 1779 public static String fromHex(String string, int minLength, String separator) { 1780 return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+")); 1781 } 1782 1783 /** 1784 * Parse a list of hex numbers and return a string 1785 * @param string String of hex numbers. 1786 * @param minLength Minimal length. 1787 * @param separator Separator. 1788 * @return A string from hex numbers. 1789 */ 1790 public static String fromHex(String string, int minLength, Pattern separator) { 1791 StringBuilder buffer = new StringBuilder(); 1792 String[] parts = separator.split(string); 1793 for (String part : parts) { 1794 if (part.length() < minLength) { 1795 throw new IllegalArgumentException("code point too short: " + part); 1796 } 1797 int cp = Integer.parseInt(part, 16); 1798 buffer.appendCodePoint(cp); 1799 } 1800 return buffer.toString(); 1801 } 1802 1803 /** 1804 * This implementation is equivalent to Java 8+ Math#addExact(int, int) 1805 * @param x the first value 1806 * @param y the second value 1807 * @return the result 1808 */ 1809 public static int addExact(int x, int y) { 1810 int r = x + y; 1811 // HD 2-12 Overflow iff both arguments have the opposite sign of the result 1812 if (((x ^ r) & (y ^ r)) < 0) { 1813 throw new ArithmeticException("integer overflow"); 1814 } 1815 return r; 1816 } 1817 1818 /** 1819 * Returns whether the chars in the two CharSequences are equal. 1820 */ 1821 public static boolean charSequenceEquals(CharSequence a, CharSequence b) { 1822 if (a == b) { 1823 return true; 1824 } 1825 if (a == null || b == null) { 1826 return false; 1827 } 1828 if (a.length() != b.length()) { 1829 return false; 1830 } 1831 for (int i = 0; i < a.length(); i++) { 1832 if (a.charAt(i) != b.charAt(i)) 1833 return false; 1834 } 1835 return true; 1836 } 1837 1838 /** 1839 * Returns a hash code for a CharSequence that is equivalent to calling 1840 * charSequence.toString().hashCode() 1841 */ 1842 public static int charSequenceHashCode(CharSequence value) { 1843 int hash = 0; 1844 for (int i = 0; i < value.length(); i++) { 1845 hash = hash * 31 + value.charAt(i); 1846 } 1847 return hash; 1848 } 1849 1850 /** 1851 * Appends a CharSequence to an Appendable, converting IOException to ICUUncheckedIOException. 1852 */ 1853 public static <A extends Appendable> A appendTo(CharSequence string, A appendable) { 1854 try { 1855 appendable.append(string); 1856 return appendable; 1857 } catch (IOException e) { 1858 throw new ICUUncheckedIOException(e); 1859 } 1860 } 1861 } 1862