1 /* 2 * Copyright (C) 2000 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.android.mail.lib.base; 17 18 import static com.android.mail.lib.base.Preconditions.checkArgument; 19 20 import com.google.common.base.Joiner; 21 import com.google.common.base.Joiner.MapJoiner; 22 23 import java.io.IOException; 24 import java.io.InputStream; 25 import java.io.StringWriter; 26 import java.util.ArrayList; 27 import java.util.Collection; 28 import java.util.Collections; 29 import java.util.HashMap; 30 import java.util.HashSet; 31 import java.util.Iterator; 32 import java.util.LinkedHashMap; 33 import java.util.LinkedList; 34 import java.util.List; 35 import java.util.Map; 36 import java.util.Set; 37 import java.util.StringTokenizer; 38 import java.util.regex.Matcher; 39 import java.util.regex.Pattern; 40 41 /** 42 * Static utility methods and constants pertaining to {@code String} or {@code 43 * CharSequence} instances. 44 */ 45 public final class StringUtil { StringUtil()46 private StringUtil() {} // COV_NF_LINE 47 48 /** 49 * A completely arbitrary selection of eight whitespace characters. See 50 * <a href="http://go/white+space">this spreadsheet</a> for more details 51 * about whitespace characters. 52 * 53 * @deprecated Rewrite your code to use {@link CharMatcher#WHITESPACE}, or 54 * consider the precise set of characters you want to match and construct 55 * the right explicit {@link CharMatcher} or {@link String} for your own 56 * purposes. 57 */ 58 @Deprecated 59 public static final String WHITE_SPACES = " \r\n\t\u3000\u00A0\u2007\u202F"; 60 61 /** A string containing the carriage return and linefeed characters. */ 62 public static final String LINE_BREAKS = "\r\n"; 63 64 /** 65 * Old location of {@link Strings#isNullOrEmpty}; this method will be 66 * deprecated soon. 67 */ isEmpty(String string)68 public static boolean isEmpty(String string) { 69 return Strings.isNullOrEmpty(string); 70 } 71 72 /** 73 * Returns {@code true} if the given string is null, empty, or comprises only 74 * whitespace characters, as defined by {@link CharMatcher#WHITESPACE}. 75 * 76 * <p><b>Warning:</b> there are many competing definitions of "whitespace"; 77 * please see <a href="http://go/white+space">this spreadsheet</a> for 78 * details. 79 * 80 * @param string the string reference to check 81 * @return {@code true} if {@code string} is null, empty, or consists of 82 * whitespace characters only 83 */ isEmptyOrWhitespace(String string)84 public static boolean isEmptyOrWhitespace(String string) { 85 return string == null || CharMatcher.WHITESPACE.matchesAllOf(string); 86 } 87 88 /** 89 * Old location of {@link Strings#nullToEmpty}; this method will be 90 * deprecated soon. 91 */ makeSafe(String string)92 public static String makeSafe(String string) { 93 return Strings.nullToEmpty(string); 94 } 95 96 /** 97 * Old location of {@link Strings#emptyToNull}; this method will be 98 * deprecated soon. 99 */ toNullIfEmpty(String string)100 public static String toNullIfEmpty(String string) { 101 return Strings.emptyToNull(string); 102 } 103 104 /** 105 * Returns the given string if it is nonempty and contains at least one 106 * non-whitespace character; {@code null} otherwise. See comment in {@link 107 * #isEmptyOrWhitespace} on the definition of whitespace. 108 * 109 * @param string the string to test and possibly return 110 * @return {@code null} if {@code string} is null, empty, or contains only 111 * whitespace characters; {@code string} itself otherwise 112 */ toNullIfEmptyOrWhitespace( String string)113 public static String toNullIfEmptyOrWhitespace( 114 String string) { 115 return isEmptyOrWhitespace(string) ? null : string; 116 } 117 118 /** 119 * Old location of {@link Strings#repeat}; this method will be deprecated 120 * soon. 121 */ repeat(String string, int count)122 public static String repeat(String string, int count) { 123 return Strings.repeat(string, count); 124 } 125 126 /** 127 * Return the first index in the string of any of the specified characters, 128 * starting at a given index, or {@code -1} if none of the characters is 129 * present. 130 * 131 * @param string the non-null character sequence to look in 132 * @param chars a non-null character sequence containing the set of characters 133 * to look for. If empty, this method will find no matches and return 134 * {@code -1} 135 * @param fromIndex the index of the first character to examine in the input 136 * string. If negative, the entire string will be searched. If greater 137 * than or equal to the string length, no characters will be searched and 138 * {@code -1} will be returned. 139 * @return the index of the first match, or {@code -1} if no match was found. 140 * Guaranteed to be either {@code -1} or a number greater than or equal to 141 * {@code fromIndex} 142 * @throws NullPointerException if any argument is null 143 */ 144 // author: pault indexOfChars( CharSequence string, CharSequence chars, int fromIndex)145 public static int indexOfChars( 146 CharSequence string, CharSequence chars, int fromIndex) { 147 if (fromIndex >= string.length()) { 148 return -1; 149 } 150 151 /* 152 * Prepare lookup structures for the characters. TODO(pault): This loop 153 * could be factored into another method to allow caching of the resulting 154 * struct if a use-case of very large character sets exists. 155 */ 156 Set<Character> charSet = Collections.emptySet(); 157 boolean[] charArray = new boolean[128]; 158 for (int i = 0; i < chars.length(); i++) { 159 char c = chars.charAt(i); 160 if (c < 128) { 161 charArray[c] = true; 162 } else { 163 if (charSet.isEmpty()) { 164 charSet = new HashSet<Character>(); 165 } 166 charSet.add(c); 167 } 168 } 169 170 // Scan the string for matches 171 for (int i = Math.max(fromIndex, 0); i < string.length(); i++) { 172 char c = string.charAt(i); 173 if (c < 128) { 174 if (charArray[c]) { 175 return i; 176 } 177 } else if (charSet.contains(c)) { 178 return i; 179 } 180 } 181 return -1; 182 } 183 184 /* 185 * ------------------------------------------------------------------- 186 * This marks the end of the code that has been written or rewritten 187 * in 2008 to the quality standards of the Java core libraries group. 188 * Code below this point is still awaiting cleanup (you can help!). 189 * See http://wiki/Nonconf/JavaCoreLibrariesStandards. 190 * ------------------------------------------------------------------- 191 */ 192 193 194 /** 195 * @param str the string to split. Must not be null. 196 * @param delims the delimiter characters. Each character in the 197 * string is individually treated as a delimiter. 198 * @return an array of tokens. Will not return null. Individual tokens 199 * do not have leading/trailing whitespace removed. 200 * @deprecated see the detailed instructions under 201 * {@link #split(String, String, boolean)} 202 */ 203 @Deprecated split(String str, String delims)204 public static String[] split(String str, String delims) { 205 return split(str, delims, false); 206 } 207 208 /** 209 * This method is deprecated because it is too inflexible, providing 210 * only a very specific set of behaviors that almost never matches exactly 211 * what you intend. Prefer using a {@link Splitter}, which is more flexible 212 * and consistent in the way it handles trimming and empty tokens. 213 * 214 * <ul> 215 * <li>Create a {@link Splitter} using {@link Splitter#on(CharMatcher)} such 216 * as {@code Splitter.on(CharMatcher.anyOf(delims))}. 217 * <li><i>If</i> you need whitespace trimmed from the ends of each segment, 218 * adding {@code .trimResults()} to your splitter definition should work 219 * in most cases. To match the exact behavior of this method, use 220 * {@code .trimResults(CharMatcher.inRange('\0', ' '))}. 221 * <li>This method silently ignores empty tokens in the input, but allows 222 * empty tokens to appear in the output if {@code trimTokens} is 223 * {@code true}. Adding {@code .omitEmptyStrings()} to your splitter 224 * definition will filter empty tokens out but will do so <i>after</i> 225 * having performed trimming. If you absolutely require this method's 226 * behavior in this respect, Splitter is not able to match it. 227 * <li>If you need the result as an array, use {@link 228 * com.google.common.collect.Iterables#toArray(Iterable, Class)} on the 229 * {@code Iterable<String>} returned by {@link Splitter#split}. 230 * </ul> 231 * 232 * @param str the string to split. Must not be null. 233 * @param delims the delimiter characters. Each character in the string 234 * is individually treated as a delimiter. 235 * @param trimTokens if true, leading/trailing whitespace is removed 236 * from the tokens 237 * @return an array of tokens. Will not return null. 238 * @deprecated 239 */ 240 @Deprecated split( String str, String delims, boolean trimTokens)241 public static String[] split( 242 String str, String delims, boolean trimTokens) { 243 StringTokenizer tokenizer = new StringTokenizer(str, delims); 244 int n = tokenizer.countTokens(); 245 String[] list = new String[n]; 246 for (int i = 0; i < n; i++) { 247 if (trimTokens) { 248 list[i] = tokenizer.nextToken().trim(); 249 } else { 250 list[i] = tokenizer.nextToken(); 251 } 252 } 253 return list; 254 } 255 256 /** 257 * Trim characters from only the beginning of a string. 258 * This is a convenience method, it simply calls trimStart(s, null). 259 * 260 * @param s String to be trimmed 261 * @return String with whitespace characters removed from the beginning 262 */ trimStart(String s)263 public static String trimStart(String s) { 264 return trimStart(s, null); 265 } 266 267 /** 268 * Trim characters from only the beginning of a string. 269 * This method will remove all whitespace characters 270 * (defined by Character.isWhitespace(char), in addition to the characters 271 * provided, from the end of the provided string. 272 * 273 * @param s String to be trimmed 274 * @param extraChars Characters in addition to whitespace characters that 275 * should be trimmed. May be null. 276 * @return String with whitespace and characters in extraChars removed 277 * from the beginning 278 */ trimStart(String s, String extraChars)279 public static String trimStart(String s, String extraChars) { 280 int trimCount = 0; 281 while (trimCount < s.length()) { 282 char ch = s.charAt(trimCount); 283 if (Character.isWhitespace(ch) 284 || (extraChars != null && extraChars.indexOf(ch) >= 0)) { 285 trimCount++; 286 } else { 287 break; 288 } 289 } 290 291 if (trimCount == 0) { 292 return s; 293 } 294 return s.substring(trimCount); 295 } 296 297 /** 298 * Trim characters from only the end of a string. 299 * This is a convenience method, it simply calls trimEnd(s, null). 300 * 301 * @param s String to be trimmed 302 * @return String with whitespace characters removed from the end 303 */ trimEnd(String s)304 public static String trimEnd(String s) { 305 return trimEnd(s, null); 306 } 307 308 /** 309 * Trim characters from only the end of a string. 310 * This method will remove all whitespace characters 311 * (defined by Character.isWhitespace(char), in addition to the characters 312 * provided, from the end of the provided string. 313 * 314 * @param s String to be trimmed 315 * @param extraChars Characters in addition to whitespace characters that 316 * should be trimmed. May be null. 317 * @return String with whitespace and characters in extraChars removed 318 * from the end 319 */ trimEnd(String s, String extraChars)320 public static String trimEnd(String s, String extraChars) { 321 int trimCount = 0; 322 while (trimCount < s.length()) { 323 char ch = s.charAt(s.length() - trimCount - 1); 324 if (Character.isWhitespace(ch) 325 || (extraChars != null && extraChars.indexOf(ch) >= 0)) { 326 trimCount++; 327 } else { 328 break; 329 } 330 } 331 332 if (trimCount == 0) { 333 return s; 334 } 335 return s.substring(0, s.length() - trimCount); 336 } 337 338 /** 339 * @param str the string to split. Must not be null. 340 * @param delims the delimiter characters. Each character in the 341 * string is individually treated as a delimiter. 342 * @return an array of tokens. Will not return null. Leading/trailing 343 * whitespace is removed from the tokens. 344 * @deprecated see the detailed instructions under 345 * {@link #split(String, String, boolean)} 346 */ 347 @Deprecated splitAndTrim(String str, String delims)348 public static String[] splitAndTrim(String str, String delims) { 349 return split(str, delims, true); 350 } 351 352 /** Parse comma-separated list of ints and return as array. */ splitInts(String str)353 public static int[] splitInts(String str) throws IllegalArgumentException { 354 StringTokenizer tokenizer = new StringTokenizer(str, ","); 355 int n = tokenizer.countTokens(); 356 int[] list = new int[n]; 357 for (int i = 0; i < n; i++) { 358 String token = tokenizer.nextToken(); 359 list[i] = Integer.parseInt(token); 360 } 361 return list; 362 } 363 364 /** Parse comma-separated list of longs and return as array. */ splitLongs(String str)365 public static long[] splitLongs(String str) throws IllegalArgumentException { 366 StringTokenizer tokenizer = new StringTokenizer(str, ","); 367 int n = tokenizer.countTokens(); 368 long[] list = new long[n]; 369 for (int i = 0; i < n; i++) { 370 String token = tokenizer.nextToken(); 371 list[i] = Long.parseLong(token); 372 } 373 return list; 374 } 375 376 /** This replaces the occurrences of 'what' in 'str' with 'with' 377 * 378 * @param str the string to process 379 * @param what to replace 380 * @param with replace with this 381 * @return String str where 'what' was replaced with 'with' 382 * 383 * @deprecated Please use {@link String#replace(CharSequence, CharSequence)}. 384 */ 385 @Deprecated replace( String str, CharSequence what, CharSequence with)386 public static String replace( 387 String str, CharSequence what, CharSequence with) { 388 // Have to check this argument, for compatibility with the old impl. 389 // For the record, String.replace() is capable of handling an empty target 390 // string... but it does something kind of weird in that case. 391 checkArgument(what.length() > 0); 392 return str.replace(what, with); 393 } 394 395 private static final Splitter NEWLINE_SPLITTER = 396 Splitter.on('\n').omitEmptyStrings(); 397 398 /** 399 * Reformats the given string to a fixed width by inserting carriage returns 400 * and trimming unnecessary whitespace. See 401 * {@link #fixedWidth(String[], int)} for details. The {@code str} argument 402 * to this method will be split on newline characters ({@code '\n'}) only 403 * (regardless of platform). An array of resulting non-empty strings is 404 * then passed to {@link #fixedWidth(String[], int)} as the {@code lines} 405 * parameter. 406 * 407 * @param str the string to format 408 * @param width the fixed width (in characters) 409 */ fixedWidth(String str, int width)410 public static String fixedWidth(String str, int width) { 411 List<String> lines = new ArrayList<String>(); 412 413 for (String line : NEWLINE_SPLITTER.split(str)) { 414 lines.add(line); 415 } 416 417 String[] lineArray = lines.toArray(new String[0]); 418 return fixedWidth(lineArray, width); 419 } 420 421 /** 422 * Reformats the given array of lines to a fixed width by inserting 423 * newlines and trimming unnecessary whitespace. This uses simple 424 * whitespace-based splitting, not sophisticated internationalized 425 * line breaking. Newlines within a line are treated like any other 426 * whitespace. Lines which are already short enough will be passed 427 * through unmodified. 428 * 429 * <p>Only breaking whitespace characters (those which match 430 * {@link CharMatcher#BREAKING_WHITESPACE}) are treated as whitespace by 431 * this method. Non-breaking whitespace characters will be considered as 432 * ordinary characters which are connected to any other adjacent 433 * non-whitespace characters, and will therefore appear in the returned 434 * string in their original context. 435 * 436 * @param lines array of lines to format 437 * @param width the fixed width (in characters) 438 */ fixedWidth(String[] lines, int width)439 public static String fixedWidth(String[] lines, int width) { 440 List<String> formattedLines = new ArrayList<String>(); 441 442 for (String line : lines) { 443 formattedLines.add(formatLineToFixedWidth(line, width)); 444 } 445 446 return Joiner.on('\n').join(formattedLines); 447 } 448 449 private static final Splitter TO_WORDS = 450 Splitter.on(CharMatcher.BREAKING_WHITESPACE).omitEmptyStrings(); 451 452 /** 453 * Helper method for {@link #fixedWidth(String[], int)} 454 */ formatLineToFixedWidth(String line, int width)455 private static String formatLineToFixedWidth(String line, int width) { 456 if (line.length() <= width) { 457 return line; 458 } 459 460 StringBuilder builder = new StringBuilder(); 461 int col = 0; 462 463 for (String word : TO_WORDS.split(line)) { 464 if (col == 0) { 465 col = word.length(); 466 } else { 467 int newCol = col + word.length() + 1; // +1 for the space 468 469 if (newCol <= width) { 470 builder.append(' '); 471 col = newCol; 472 } else { 473 builder.append('\n'); 474 col = word.length(); 475 } 476 } 477 478 builder.append(word); 479 } 480 481 return builder.toString(); 482 } 483 484 /** 485 * Splits the argument original into a list of substrings. All the 486 * substrings in the returned list (except possibly the last) will 487 * have length lineLen. 488 * 489 * @param lineLen the length of the substrings to put in the list 490 * @param original the original string 491 * 492 * @return a list of strings of length lineLen that together make up the 493 * original string 494 * @deprecated use {@code Splitter.fixedLength(lineLen).split(original))} 495 * (note that it returns an {@code Iterable}, not a {@code List}) 496 */ 497 @Deprecated fixedSplit(String original, int lineLen)498 public static List<String> fixedSplit(String original, int lineLen) { 499 List<String> output = new ArrayList<String>(); 500 for (String elem : Splitter.fixedLength(lineLen).split(original)) { 501 output.add(elem); 502 } 503 return output; 504 } 505 506 /** 507 * Indents the given String per line. 508 * @param iString the string to indent 509 * @param iIndentDepth the depth of the indentation 510 * @return the indented string 511 */ indent(String iString, int iIndentDepth)512 public static String indent(String iString, int iIndentDepth) { 513 StringBuilder spacer = new StringBuilder(); 514 spacer.append("\n"); 515 for (int i = 0; i < iIndentDepth; i++) { 516 spacer.append(" "); 517 } 518 return iString.replace("\n", spacer.toString()); 519 } 520 521 /** 522 * This is a both way strip. 523 * 524 * @param str the string to strip 525 * @param left strip from left 526 * @param right strip from right 527 * @param what character(s) to strip 528 * @return the stripped string 529 * @deprecated ensure the string is not null and use 530 * <ul> 531 * <li> {@code CharMatcher.anyOf(what).trimFrom(str)} 532 * if {@code left == true} and {@code right == true} 533 * <li> {@code CharMatcher.anyOf(what).trimLeadingFrom(str)} 534 * if {@code left == true} and {@code right == false} 535 * <li> {@code CharMatcher.anyOf(what).trimTrailingFrom(str)} 536 * if {@code left == false} and {@code right == true} 537 * </ul> 538 */ 539 @Deprecated megastrip(String str, boolean left, boolean right, String what)540 public static String megastrip(String str, 541 boolean left, boolean right, 542 String what) { 543 if (str == null) { 544 return null; 545 } 546 547 CharMatcher matcher = CharMatcher.anyOf(what); 548 if (left) { 549 if (right) { 550 return matcher.trimFrom(str); 551 } 552 return matcher.trimLeadingFrom(str); 553 } 554 if (right) { 555 return matcher.trimTrailingFrom(str); 556 } 557 return str; 558 } 559 560 /** strip - strips both ways 561 * 562 * @param str what to strip 563 * @return String the striped string 564 * @deprecated ensure the string is not null and use {@code 565 * CharMatcher.LEGACY_WHITESPACE.trimFrom(str)}; also consider whether you 566 * really want the legacy whitespace definition, or something more 567 * standard like {@link CharMatcher#WHITESPACE}. 568 */ 569 @SuppressWarnings("deprecation") // this is deprecated itself strip(String str)570 @Deprecated public static String strip(String str) { 571 return (str == null) ? null : CharMatcher.LEGACY_WHITESPACE.trimFrom(str); 572 } 573 574 /** Strip white spaces from both end, and collapse white spaces 575 * in the middle. 576 * 577 * @param str what to strip 578 * @return String the striped and collapsed string 579 * @deprecated ensure the string is not null and use {@code 580 * CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ')}; also 581 * consider whether you really want the legacy whitespace definition, or 582 * something more standard like {@link CharMatcher#WHITESPACE}. 583 */ 584 @SuppressWarnings("deprecation") // this is deprecated itself stripAndCollapse(String str)585 @Deprecated public static String stripAndCollapse(String str) { 586 return (str == null) ? null 587 : CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' '); 588 } 589 590 /** 591 * Give me a string and a potential prefix, and I return the string 592 * following the prefix if the prefix matches, else null. 593 * Analogous to the c++ functions strprefix and var_strprefix. 594 * 595 * @param str the string to strip 596 * @param prefix the expected prefix 597 * @return the stripped string or <code>null</code> if the string 598 * does not start with the prefix 599 */ stripPrefix(String str, String prefix)600 public static String stripPrefix(String str, String prefix) { 601 return str.startsWith(prefix) 602 ? str.substring(prefix.length()) 603 : null; 604 } 605 606 /** 607 * Case insensitive version of stripPrefix. Strings are compared in 608 * the same way as in {@link String#equalsIgnoreCase}. 609 * Analogous to the c++ functions strcaseprefix and var_strcaseprefix. 610 * 611 * @param str the string to strip 612 * @param prefix the expected prefix 613 * @return the stripped string or <code>null</code> if the string 614 * does not start with the prefix 615 */ stripPrefixIgnoreCase(String str, String prefix)616 public static String stripPrefixIgnoreCase(String str, String prefix) { 617 return startsWithIgnoreCase(str, prefix) 618 ? str.substring(prefix.length()) 619 : null; 620 } 621 622 /** 623 * Give me a string and a potential suffix, and I return the string 624 * before the suffix if the suffix matches, else null. 625 * Analogous to the c++ function strsuffix. 626 * 627 * @param str the string to strip 628 * @param suffix the expected suffix 629 * @return the stripped string or <code>null</code> if the string 630 * does not end with the suffix 631 */ stripSuffix(String str, String suffix)632 public static String stripSuffix(String str, String suffix) { 633 return str.endsWith(suffix) 634 ? str.substring(0, str.length() - suffix.length()) 635 : null; 636 } 637 638 /** 639 * Case insensitive version of stripSuffix. Strings are compared in 640 * the same way as in {@link String#equalsIgnoreCase}. 641 * Analogous to the c++ function strcasesuffix. 642 * 643 * @param str the string to strip 644 * @param suffix the expected suffix 645 * @return the stripped string or <code>null</code> if the string 646 * does not end with the suffix 647 */ stripSuffixIgnoreCase( String str, String suffix)648 public static String stripSuffixIgnoreCase( 649 String str, String suffix) { 650 return endsWithIgnoreCase(str, suffix) 651 ? str.substring(0, str.length() - suffix.length()) 652 : null; 653 } 654 655 /** 656 * Strips all non-digit characters from a string. 657 * 658 * The resulting string will only contain characters for which isDigit() 659 * returns true. 660 * 661 * @param str the string to strip 662 * @return a string consisting of digits only, or an empty string 663 * @deprecated use {@code CharMatcher.JAVA_DIGIT.retainFrom(str)} (also 664 * consider whether this is really the definition of "digit" you wish to 665 * use) 666 */ stripNonDigits(String str)667 @Deprecated public static String stripNonDigits(String str) { 668 return CharMatcher.JAVA_DIGIT.retainFrom(str); 669 } 670 671 /** 672 * Finds the last index in str of a character not in the characters 673 * in 'chars' (similar to ANSI string.find_last_not_of). 674 * 675 * Returns -1 if no such character can be found. 676 * 677 * <p><b>Note:</b> If {@code fromIndex} is zero, use {@link CharMatcher} 678 * instead for this: {@code CharMatcher.noneOf(chars).lastIndexIn(str)}. 679 */ 680 // TODO(kevinb): after adding fromIndex versions of (last)IndexOf to 681 // CharMatcher, deprecate this lastIndexNotOf(String str, String chars, int fromIndex)682 public static int lastIndexNotOf(String str, String chars, int fromIndex) { 683 fromIndex = Math.min(fromIndex, str.length() - 1); 684 685 for (int pos = fromIndex; pos >= 0; pos--) { 686 if (chars.indexOf(str.charAt(pos)) < 0) { 687 return pos; 688 } 689 } 690 691 return -1; 692 } 693 694 /** 695 * Like String.replace() except that it accepts any number of old chars. 696 * Replaces any occurrances of 'oldchars' in 'str' with 'newchar'. 697 * Example: replaceChars("Hello, world!", "H,!", ' ') returns " ello world " 698 * 699 * @deprecated use {@code CharMatcher#replaceFrom(String, char)}, for example 700 * {@code CharMatcher.anyOf(oldchars).replaceFrom(str, newchar)} 701 */ replaceChars( String str, CharSequence oldchars, char newchar)702 @Deprecated public static String replaceChars( 703 String str, CharSequence oldchars, char newchar) { 704 return CharMatcher.anyOf(oldchars).replaceFrom(str, newchar); 705 } 706 707 /** 708 * Remove any occurrances of 'oldchars' in 'str'. 709 * Example: removeChars("Hello, world!", ",!") returns "Hello world" 710 * 711 * @deprecated use {@link CharMatcher#removeFrom(CharSequence)}, for example 712 * {@code CharMatcher.anyOf(oldchars).removeFrom(str)} 713 */ removeChars( String str, CharSequence oldchars)714 @Deprecated public static String removeChars( 715 String str, CharSequence oldchars) { 716 return CharMatcher.anyOf(oldchars).removeFrom(str); 717 } 718 719 // See http://www.microsoft.com/typography/unicode/1252.htm 720 private static final CharMatcher FANCY_SINGLE_QUOTE 721 = CharMatcher.anyOf("\u0091\u0092\u2018\u2019"); 722 private static final CharMatcher FANCY_DOUBLE_QUOTE 723 = CharMatcher.anyOf("\u0093\u0094\u201c\u201d"); 724 725 /** 726 * Replaces microsoft "smart quotes" (curly " and ') with their 727 * ascii counterparts. 728 */ replaceSmartQuotes(String str)729 public static String replaceSmartQuotes(String str) { 730 String tmp = FANCY_SINGLE_QUOTE.replaceFrom(str, '\''); 731 return FANCY_DOUBLE_QUOTE.replaceFrom(tmp, '"'); 732 } 733 734 /** 735 * Convert a string of hex digits to a byte array, with the first 736 * byte in the array being the MSB. The string passed in should be 737 * just the raw digits (upper or lower case), with no leading 738 * or trailing characters (like '0x' or 'h'). 739 * An odd number of characters is supported. 740 * If the string is empty, an empty array will be returned. 741 * 742 * This is significantly faster than using 743 * new BigInteger(str, 16).toByteArray(); 744 * especially with larger strings. Here are the results of some 745 * microbenchmarks done on a P4 2.8GHz 2GB RAM running 746 * linux 2.4.22-gg11 and JDK 1.5 with an optimized build: 747 * 748 * String length hexToBytes (usec) BigInteger 749 * ----------------------------------------------------- 750 * 16 0.570 1.43 751 * 256 8.21 44.4 752 * 1024 32.8 526 753 * 16384 546 121000 754 */ hexToBytes(CharSequence str)755 public static byte[] hexToBytes(CharSequence str) { 756 byte[] bytes = new byte[(str.length() + 1) / 2]; 757 if (str.length() == 0) { 758 return bytes; 759 } 760 bytes[0] = 0; 761 int nibbleIdx = (str.length() % 2); 762 for (int i = 0; i < str.length(); i++) { 763 char c = str.charAt(i); 764 if (!isHex(c)) { 765 throw new IllegalArgumentException("string contains non-hex chars"); 766 } 767 if ((nibbleIdx % 2) == 0) { 768 bytes[nibbleIdx >> 1] = (byte) (hexValue(c) << 4); 769 } else { 770 bytes[nibbleIdx >> 1] += (byte) hexValue(c); 771 } 772 nibbleIdx++; 773 } 774 return bytes; 775 } 776 777 /** 778 * Converts any instances of "\r" or "\r\n" style EOLs into "\n" (Line Feed). 779 */ convertEOLToLF(String input)780 public static String convertEOLToLF(String input) { 781 StringBuilder res = new StringBuilder(input.length()); 782 char[] s = input.toCharArray(); 783 int from = 0; 784 final int end = s.length; 785 for (int i = 0; i < end; i++) { 786 if (s[i] == '\r') { 787 res.append(s, from, i - from); 788 res.append('\n'); 789 if (i + 1 < end && s[i + 1] == '\n') { 790 i++; 791 } 792 793 from = i + 1; 794 } 795 } 796 797 if (from == 0) { // no \r! 798 return input; 799 } 800 801 res.append(s, from, end - from); 802 return res.toString(); 803 } 804 805 /** 806 * Old location of {@link Strings#padStart}; this method will be deprecated 807 * soon. 808 */ padLeft(String s, int len, char padChar)809 public static String padLeft(String s, int len, char padChar) { 810 return Strings.padStart(s, len, padChar); 811 } 812 813 /** 814 * Old location of {@link Strings#padEnd}; this method will be deprecated 815 * soon. 816 */ padRight(String s, int len, char padChar)817 public static String padRight(String s, int len, char padChar) { 818 return Strings.padEnd(s, len, padChar); 819 } 820 821 /** 822 * Returns a string consisting of "s", with each of the first "len" characters 823 * replaced by "maskChar" character. 824 */ maskLeft(String s, int len, char maskChar)825 public static String maskLeft(String s, int len, char maskChar) { 826 if (len <= 0) { 827 return s; 828 } 829 len = Math.min(len, s.length()); 830 StringBuilder sb = new StringBuilder(); 831 for (int i = 0; i < len; i++) { 832 sb.append(maskChar); 833 } 834 sb.append(s.substring(len)); 835 return sb.toString(); 836 } 837 isOctal(char c)838 private static boolean isOctal(char c) { 839 return (c >= '0') && (c <= '7'); 840 } 841 isHex(char c)842 private static boolean isHex(char c) { 843 return ((c >= '0') && (c <= '9')) || 844 ((c >= 'a') && (c <= 'f')) || 845 ((c >= 'A') && (c <= 'F')); 846 } 847 hexValue(char c)848 private static int hexValue(char c) { 849 if ((c >= '0') && (c <= '9')) { 850 return (c - '0'); 851 } else if ((c >= 'a') && (c <= 'f')) { 852 return (c - 'a') + 10; 853 } else { 854 return (c - 'A') + 10; 855 } 856 } 857 858 /** 859 * Unescape any C escape sequences (\n, \r, \\, \ooo, etc) and return the 860 * resulting string. 861 */ unescapeCString(String s)862 public static String unescapeCString(String s) { 863 if (s.indexOf('\\') < 0) { 864 // Fast path: nothing to unescape 865 return s; 866 } 867 868 StringBuilder sb = new StringBuilder(); 869 int len = s.length(); 870 for (int i = 0; i < len;) { 871 char c = s.charAt(i++); 872 if (c == '\\' && (i < len)) { 873 c = s.charAt(i++); 874 switch (c) { 875 case 'a': c = '\007'; break; 876 case 'b': c = '\b'; break; 877 case 'f': c = '\f'; break; 878 case 'n': c = '\n'; break; 879 case 'r': c = '\r'; break; 880 case 't': c = '\t'; break; 881 case 'v': c = '\013'; break; 882 case '\\': c = '\\'; break; 883 case '?': c = '?'; break; 884 case '\'': c = '\''; break; 885 case '"': c = '\"'; break; 886 887 default: { 888 if ((c == 'x') && (i < len) && isHex(s.charAt(i))) { 889 // "\xXX" 890 int v = hexValue(s.charAt(i++)); 891 if ((i < len) && isHex(s.charAt(i))) { 892 v = v * 16 + hexValue(s.charAt(i++)); 893 } 894 c = (char) v; 895 } else if (isOctal(c)) { 896 // "\OOO" 897 int v = (c - '0'); 898 if ((i < len) && isOctal(s.charAt(i))) { 899 v = v * 8 + (s.charAt(i++) - '0'); 900 } 901 if ((i < len) && isOctal(s.charAt(i))) { 902 v = v * 8 + (s.charAt(i++) - '0'); 903 } 904 c = (char) v; 905 } else { 906 // Propagate unknown escape sequences. 907 sb.append('\\'); 908 } 909 break; 910 } 911 } 912 } 913 sb.append(c); 914 } 915 return sb.toString(); 916 } 917 918 /** 919 * Unescape any MySQL escape sequences. 920 * See MySQL language reference Chapter 6 at 921 * <a href="http://www.mysql.com/doc/">http://www.mysql.com/doc/</a>. 922 * This function will <strong>not</strong> work for other SQL-like 923 * dialects. 924 * @param s string to unescape, with the surrounding quotes. 925 * @return unescaped string, without the surrounding quotes. 926 * @exception IllegalArgumentException if s is not a valid MySQL string. 927 */ unescapeMySQLString(String s)928 public static String unescapeMySQLString(String s) 929 throws IllegalArgumentException { 930 // note: the same buffer is used for both reading and writing 931 // it works because the writer can never outrun the reader 932 char chars[] = s.toCharArray(); 933 934 // the string must be quoted 'like this' or "like this" 935 if (chars.length < 2 || chars[0] != chars[chars.length - 1] || 936 (chars[0] != '\'' && chars[0] != '"')) { 937 throw new IllegalArgumentException("not a valid MySQL string: " + s); 938 } 939 940 // parse the string and decode the backslash sequences; in addition, 941 // quotes can be escaped 'like this: ''', "like this: """, or 'like this: "' 942 int j = 1; // write position in the string (never exceeds read position) 943 int f = 0; // state: 0 (normal), 1 (backslash), 2 (quote) 944 for (int i = 1; i < chars.length - 1; i++) { 945 if (f == 0) { // previous character was normal 946 if (chars[i] == '\\') { 947 f = 1; // backslash 948 } else if (chars[i] == chars[0]) { 949 f = 2; // quoting character 950 } else { 951 chars[j++] = chars[i]; 952 } 953 } else if (f == 1) { // previous character was a backslash 954 switch (chars[i]) { 955 case '0': chars[j++] = '\0'; break; 956 case '\'': chars[j++] = '\''; break; 957 case '"': chars[j++] = '"'; break; 958 case 'b': chars[j++] = '\b'; break; 959 case 'n': chars[j++] = '\n'; break; 960 case 'r': chars[j++] = '\r'; break; 961 case 't': chars[j++] = '\t'; break; 962 case 'z': chars[j++] = '\032'; break; 963 case '\\': chars[j++] = '\\'; break; 964 default: 965 // if the character is not special, backslash disappears 966 chars[j++] = chars[i]; 967 break; 968 } 969 f = 0; 970 } else { // previous character was a quote 971 // quoting characters must be doubled inside a string 972 if (chars[i] != chars[0]) { 973 throw new IllegalArgumentException("not a valid MySQL string: " + s); 974 } 975 chars[j++] = chars[0]; 976 f = 0; 977 } 978 } 979 // string contents cannot end with a special character 980 if (f != 0) { 981 throw new IllegalArgumentException("not a valid MySQL string: " + s); 982 } 983 984 // done 985 return new String(chars, 1, j - 1); 986 } 987 988 // TODO(pbarry): move all HTML methods to common.html package 989 990 static final Map<String, Character> ESCAPE_STRINGS; 991 static final Set<Character> HEX_LETTERS; 992 993 static { 994 // HTML character entity references as defined in HTML 4 995 // see http://www.w3.org/TR/REC-html40/sgml/entities.html 996 ESCAPE_STRINGS = new HashMap<String, Character>(252); 997 998 ESCAPE_STRINGS.put(" ", '\u00A0'); 999 ESCAPE_STRINGS.put("¡", '\u00A1'); 1000 ESCAPE_STRINGS.put("¢", '\u00A2'); 1001 ESCAPE_STRINGS.put("£", '\u00A3'); 1002 ESCAPE_STRINGS.put("¤", '\u00A4'); 1003 ESCAPE_STRINGS.put("¥", '\u00A5'); 1004 ESCAPE_STRINGS.put("¦", '\u00A6'); 1005 ESCAPE_STRINGS.put("§", '\u00A7'); 1006 ESCAPE_STRINGS.put("¨", '\u00A8'); 1007 ESCAPE_STRINGS.put("©", '\u00A9'); 1008 ESCAPE_STRINGS.put("ª", '\u00AA'); 1009 ESCAPE_STRINGS.put("«", '\u00AB'); 1010 ESCAPE_STRINGS.put("¬", '\u00AC'); 1011 ESCAPE_STRINGS.put("­", '\u00AD'); 1012 ESCAPE_STRINGS.put("®", '\u00AE'); 1013 ESCAPE_STRINGS.put("¯", '\u00AF'); 1014 ESCAPE_STRINGS.put("°", '\u00B0'); 1015 ESCAPE_STRINGS.put("±", '\u00B1'); 1016 ESCAPE_STRINGS.put("²", '\u00B2'); 1017 ESCAPE_STRINGS.put("³", '\u00B3'); 1018 ESCAPE_STRINGS.put("´", '\u00B4'); 1019 ESCAPE_STRINGS.put("µ", '\u00B5'); 1020 ESCAPE_STRINGS.put("¶", '\u00B6'); 1021 ESCAPE_STRINGS.put("·", '\u00B7'); 1022 ESCAPE_STRINGS.put("¸", '\u00B8'); 1023 ESCAPE_STRINGS.put("¹", '\u00B9'); 1024 ESCAPE_STRINGS.put("º", '\u00BA'); 1025 ESCAPE_STRINGS.put("»", '\u00BB'); 1026 ESCAPE_STRINGS.put("¼", '\u00BC'); 1027 ESCAPE_STRINGS.put("½", '\u00BD'); 1028 ESCAPE_STRINGS.put("¾", '\u00BE'); 1029 ESCAPE_STRINGS.put("¿", '\u00BF'); 1030 ESCAPE_STRINGS.put("À", '\u00C0'); 1031 ESCAPE_STRINGS.put("Á", '\u00C1'); 1032 ESCAPE_STRINGS.put("Â", '\u00C2'); 1033 ESCAPE_STRINGS.put("Ã", '\u00C3'); 1034 ESCAPE_STRINGS.put("Ä", '\u00C4'); 1035 ESCAPE_STRINGS.put("Å", '\u00C5'); 1036 ESCAPE_STRINGS.put("Æ", '\u00C6'); 1037 ESCAPE_STRINGS.put("Ç", '\u00C7'); 1038 ESCAPE_STRINGS.put("È", '\u00C8'); 1039 ESCAPE_STRINGS.put("É", '\u00C9'); 1040 ESCAPE_STRINGS.put("Ê", '\u00CA'); 1041 ESCAPE_STRINGS.put("Ë", '\u00CB'); 1042 ESCAPE_STRINGS.put("Ì", '\u00CC'); 1043 ESCAPE_STRINGS.put("Í", '\u00CD'); 1044 ESCAPE_STRINGS.put("Î", '\u00CE'); 1045 ESCAPE_STRINGS.put("Ï", '\u00CF'); 1046 ESCAPE_STRINGS.put("Ð", '\u00D0'); 1047 ESCAPE_STRINGS.put("Ñ", '\u00D1'); 1048 ESCAPE_STRINGS.put("Ò", '\u00D2'); 1049 ESCAPE_STRINGS.put("Ó", '\u00D3'); 1050 ESCAPE_STRINGS.put("Ô", '\u00D4'); 1051 ESCAPE_STRINGS.put("Õ", '\u00D5'); 1052 ESCAPE_STRINGS.put("Ö", '\u00D6'); 1053 ESCAPE_STRINGS.put("×", '\u00D7'); 1054 ESCAPE_STRINGS.put("Ø", '\u00D8'); 1055 ESCAPE_STRINGS.put("Ù", '\u00D9'); 1056 ESCAPE_STRINGS.put("Ú", '\u00DA'); 1057 ESCAPE_STRINGS.put("Û", '\u00DB'); 1058 ESCAPE_STRINGS.put("Ü", '\u00DC'); 1059 ESCAPE_STRINGS.put("Ý", '\u00DD'); 1060 ESCAPE_STRINGS.put("Þ", '\u00DE'); 1061 ESCAPE_STRINGS.put("ß", '\u00DF'); 1062 ESCAPE_STRINGS.put("à", '\u00E0'); 1063 ESCAPE_STRINGS.put("á", '\u00E1'); 1064 ESCAPE_STRINGS.put("â", '\u00E2'); 1065 ESCAPE_STRINGS.put("ã", '\u00E3'); 1066 ESCAPE_STRINGS.put("ä", '\u00E4'); 1067 ESCAPE_STRINGS.put("å", '\u00E5'); 1068 ESCAPE_STRINGS.put("æ", '\u00E6'); 1069 ESCAPE_STRINGS.put("ç", '\u00E7'); 1070 ESCAPE_STRINGS.put("è", '\u00E8'); 1071 ESCAPE_STRINGS.put("é", '\u00E9'); 1072 ESCAPE_STRINGS.put("ê", '\u00EA'); 1073 ESCAPE_STRINGS.put("ë", '\u00EB'); 1074 ESCAPE_STRINGS.put("ì", '\u00EC'); 1075 ESCAPE_STRINGS.put("í", '\u00ED'); 1076 ESCAPE_STRINGS.put("î", '\u00EE'); 1077 ESCAPE_STRINGS.put("ï", '\u00EF'); 1078 ESCAPE_STRINGS.put("ð", '\u00F0'); 1079 ESCAPE_STRINGS.put("ñ", '\u00F1'); 1080 ESCAPE_STRINGS.put("ò", '\u00F2'); 1081 ESCAPE_STRINGS.put("ó", '\u00F3'); 1082 ESCAPE_STRINGS.put("ô", '\u00F4'); 1083 ESCAPE_STRINGS.put("õ", '\u00F5'); 1084 ESCAPE_STRINGS.put("ö", '\u00F6'); 1085 ESCAPE_STRINGS.put("÷", '\u00F7'); 1086 ESCAPE_STRINGS.put("ø", '\u00F8'); 1087 ESCAPE_STRINGS.put("ù", '\u00F9'); 1088 ESCAPE_STRINGS.put("ú", '\u00FA'); 1089 ESCAPE_STRINGS.put("û", '\u00FB'); 1090 ESCAPE_STRINGS.put("ü", '\u00FC'); 1091 ESCAPE_STRINGS.put("ý", '\u00FD'); 1092 ESCAPE_STRINGS.put("þ", '\u00FE'); 1093 ESCAPE_STRINGS.put("ÿ", '\u00FF'); 1094 ESCAPE_STRINGS.put("&fnof", '\u0192'); 1095 ESCAPE_STRINGS.put("&Alpha", '\u0391'); 1096 ESCAPE_STRINGS.put("&Beta", '\u0392'); 1097 ESCAPE_STRINGS.put("&Gamma", '\u0393'); 1098 ESCAPE_STRINGS.put("&Delta", '\u0394'); 1099 ESCAPE_STRINGS.put("&Epsilon", '\u0395'); 1100 ESCAPE_STRINGS.put("&Zeta", '\u0396'); 1101 ESCAPE_STRINGS.put("&Eta", '\u0397'); 1102 ESCAPE_STRINGS.put("&Theta", '\u0398'); 1103 ESCAPE_STRINGS.put("&Iota", '\u0399'); 1104 ESCAPE_STRINGS.put("&Kappa", '\u039A'); 1105 ESCAPE_STRINGS.put("&Lambda", '\u039B'); 1106 ESCAPE_STRINGS.put("&Mu", '\u039C'); 1107 ESCAPE_STRINGS.put("&Nu", '\u039D'); 1108 ESCAPE_STRINGS.put("&Xi", '\u039E'); 1109 ESCAPE_STRINGS.put("&Omicron", '\u039F'); 1110 ESCAPE_STRINGS.put("&Pi", '\u03A0'); 1111 ESCAPE_STRINGS.put("&Rho", '\u03A1'); 1112 ESCAPE_STRINGS.put("&Sigma", '\u03A3'); 1113 ESCAPE_STRINGS.put("&Tau", '\u03A4'); 1114 ESCAPE_STRINGS.put("&Upsilon", '\u03A5'); 1115 ESCAPE_STRINGS.put("&Phi", '\u03A6'); 1116 ESCAPE_STRINGS.put("&Chi", '\u03A7'); 1117 ESCAPE_STRINGS.put("&Psi", '\u03A8'); 1118 ESCAPE_STRINGS.put("&Omega", '\u03A9'); 1119 ESCAPE_STRINGS.put("&alpha", '\u03B1'); 1120 ESCAPE_STRINGS.put("&beta", '\u03B2'); 1121 ESCAPE_STRINGS.put("&gamma", '\u03B3'); 1122 ESCAPE_STRINGS.put("&delta", '\u03B4'); 1123 ESCAPE_STRINGS.put("&epsilon", '\u03B5'); 1124 ESCAPE_STRINGS.put("&zeta", '\u03B6'); 1125 ESCAPE_STRINGS.put("&eta", '\u03B7'); 1126 ESCAPE_STRINGS.put("&theta", '\u03B8'); 1127 ESCAPE_STRINGS.put("&iota", '\u03B9'); 1128 ESCAPE_STRINGS.put("&kappa", '\u03BA'); 1129 ESCAPE_STRINGS.put("&lambda", '\u03BB'); 1130 ESCAPE_STRINGS.put("&mu", '\u03BC'); 1131 ESCAPE_STRINGS.put("&nu", '\u03BD'); 1132 ESCAPE_STRINGS.put("&xi", '\u03BE'); 1133 ESCAPE_STRINGS.put("&omicron", '\u03BF'); 1134 ESCAPE_STRINGS.put("&pi", '\u03C0'); 1135 ESCAPE_STRINGS.put("&rho", '\u03C1'); 1136 ESCAPE_STRINGS.put("&sigmaf", '\u03C2'); 1137 ESCAPE_STRINGS.put("&sigma", '\u03C3'); 1138 ESCAPE_STRINGS.put("&tau", '\u03C4'); 1139 ESCAPE_STRINGS.put("&upsilon", '\u03C5'); 1140 ESCAPE_STRINGS.put("&phi", '\u03C6'); 1141 ESCAPE_STRINGS.put("&chi", '\u03C7'); 1142 ESCAPE_STRINGS.put("&psi", '\u03C8'); 1143 ESCAPE_STRINGS.put("&omega", '\u03C9'); 1144 ESCAPE_STRINGS.put("&thetasym", '\u03D1'); 1145 ESCAPE_STRINGS.put("&upsih", '\u03D2'); 1146 ESCAPE_STRINGS.put("&piv", '\u03D6'); 1147 ESCAPE_STRINGS.put("&bull", '\u2022'); 1148 ESCAPE_STRINGS.put("&hellip", '\u2026'); 1149 ESCAPE_STRINGS.put("&prime", '\u2032'); 1150 ESCAPE_STRINGS.put("&Prime", '\u2033'); 1151 ESCAPE_STRINGS.put("&oline", '\u203E'); 1152 ESCAPE_STRINGS.put("&frasl", '\u2044'); 1153 ESCAPE_STRINGS.put("&weierp", '\u2118'); 1154 ESCAPE_STRINGS.put("&image", '\u2111'); 1155 ESCAPE_STRINGS.put("&real", '\u211C'); 1156 ESCAPE_STRINGS.put("&trade", '\u2122'); 1157 ESCAPE_STRINGS.put("&alefsym", '\u2135'); 1158 ESCAPE_STRINGS.put("&larr", '\u2190'); 1159 ESCAPE_STRINGS.put("&uarr", '\u2191'); 1160 ESCAPE_STRINGS.put("&rarr", '\u2192'); 1161 ESCAPE_STRINGS.put("&darr", '\u2193'); 1162 ESCAPE_STRINGS.put("&harr", '\u2194'); 1163 ESCAPE_STRINGS.put("&crarr", '\u21B5'); 1164 ESCAPE_STRINGS.put("&lArr", '\u21D0'); 1165 ESCAPE_STRINGS.put("&uArr", '\u21D1'); 1166 ESCAPE_STRINGS.put("&rArr", '\u21D2'); 1167 ESCAPE_STRINGS.put("&dArr", '\u21D3'); 1168 ESCAPE_STRINGS.put("&hArr", '\u21D4'); 1169 ESCAPE_STRINGS.put("&forall", '\u2200'); 1170 ESCAPE_STRINGS.put("&part", '\u2202'); 1171 ESCAPE_STRINGS.put("&exist", '\u2203'); 1172 ESCAPE_STRINGS.put("&empty", '\u2205'); 1173 ESCAPE_STRINGS.put("&nabla", '\u2207'); 1174 ESCAPE_STRINGS.put("&isin", '\u2208'); 1175 ESCAPE_STRINGS.put("¬in", '\u2209'); 1176 ESCAPE_STRINGS.put("&ni", '\u220B'); 1177 ESCAPE_STRINGS.put("&prod", '\u220F'); 1178 ESCAPE_STRINGS.put("&sum", '\u2211'); 1179 ESCAPE_STRINGS.put("&minus", '\u2212'); 1180 ESCAPE_STRINGS.put("&lowast", '\u2217'); 1181 ESCAPE_STRINGS.put("&radic", '\u221A'); 1182 ESCAPE_STRINGS.put("&prop", '\u221D'); 1183 ESCAPE_STRINGS.put("&infin", '\u221E'); 1184 ESCAPE_STRINGS.put("&ang", '\u2220'); 1185 ESCAPE_STRINGS.put("&and", '\u2227'); 1186 ESCAPE_STRINGS.put("&or", '\u2228'); 1187 ESCAPE_STRINGS.put("&cap", '\u2229'); 1188 ESCAPE_STRINGS.put("&cup", '\u222A'); 1189 ESCAPE_STRINGS.put("&int", '\u222B'); 1190 ESCAPE_STRINGS.put("&there4", '\u2234'); 1191 ESCAPE_STRINGS.put("&sim", '\u223C'); 1192 ESCAPE_STRINGS.put("&cong", '\u2245'); 1193 ESCAPE_STRINGS.put("&asymp", '\u2248'); 1194 ESCAPE_STRINGS.put("&ne", '\u2260'); 1195 ESCAPE_STRINGS.put("&equiv", '\u2261'); 1196 ESCAPE_STRINGS.put("&le", '\u2264'); 1197 ESCAPE_STRINGS.put("&ge", '\u2265'); 1198 ESCAPE_STRINGS.put("&sub", '\u2282'); 1199 ESCAPE_STRINGS.put("&sup", '\u2283'); 1200 ESCAPE_STRINGS.put("&nsub", '\u2284'); 1201 ESCAPE_STRINGS.put("&sube", '\u2286'); 1202 ESCAPE_STRINGS.put("&supe", '\u2287'); 1203 ESCAPE_STRINGS.put("&oplus", '\u2295'); 1204 ESCAPE_STRINGS.put("&otimes", '\u2297'); 1205 ESCAPE_STRINGS.put("&perp", '\u22A5'); 1206 ESCAPE_STRINGS.put("&sdot", '\u22C5'); 1207 ESCAPE_STRINGS.put("&lceil", '\u2308'); 1208 ESCAPE_STRINGS.put("&rceil", '\u2309'); 1209 ESCAPE_STRINGS.put("&lfloor", '\u230A'); 1210 ESCAPE_STRINGS.put("&rfloor", '\u230B'); 1211 ESCAPE_STRINGS.put("&lang", '\u2329'); 1212 ESCAPE_STRINGS.put("&rang", '\u232A'); 1213 ESCAPE_STRINGS.put("&loz", '\u25CA'); 1214 ESCAPE_STRINGS.put("&spades", '\u2660'); 1215 ESCAPE_STRINGS.put("&clubs", '\u2663'); 1216 ESCAPE_STRINGS.put("&hearts", '\u2665'); 1217 ESCAPE_STRINGS.put("&diams", '\u2666'); 1218 ESCAPE_STRINGS.put(""", '\u0022'); 1219 ESCAPE_STRINGS.put("&", '\u0026'); 1220 ESCAPE_STRINGS.put("<", '\u003C'); 1221 ESCAPE_STRINGS.put(">", '\u003E'); 1222 ESCAPE_STRINGS.put("&OElig", '\u0152'); 1223 ESCAPE_STRINGS.put("&oelig", '\u0153'); 1224 ESCAPE_STRINGS.put("&Scaron", '\u0160'); 1225 ESCAPE_STRINGS.put("&scaron", '\u0161'); 1226 ESCAPE_STRINGS.put("&Yuml", '\u0178'); 1227 ESCAPE_STRINGS.put("&circ", '\u02C6'); 1228 ESCAPE_STRINGS.put("&tilde", '\u02DC'); 1229 ESCAPE_STRINGS.put("&ensp", '\u2002'); 1230 ESCAPE_STRINGS.put("&emsp", '\u2003'); 1231 ESCAPE_STRINGS.put("&thinsp", '\u2009'); 1232 ESCAPE_STRINGS.put("&zwnj", '\u200C'); 1233 ESCAPE_STRINGS.put("&zwj", '\u200D'); 1234 ESCAPE_STRINGS.put("&lrm", '\u200E'); 1235 ESCAPE_STRINGS.put("&rlm", '\u200F'); 1236 ESCAPE_STRINGS.put("&ndash", '\u2013'); 1237 ESCAPE_STRINGS.put("&mdash", '\u2014'); 1238 ESCAPE_STRINGS.put("&lsquo", '\u2018'); 1239 ESCAPE_STRINGS.put("&rsquo", '\u2019'); 1240 ESCAPE_STRINGS.put("&sbquo", '\u201A'); 1241 ESCAPE_STRINGS.put("&ldquo", '\u201C'); 1242 ESCAPE_STRINGS.put("&rdquo", '\u201D'); 1243 ESCAPE_STRINGS.put("&bdquo", '\u201E'); 1244 ESCAPE_STRINGS.put("&dagger", '\u2020'); 1245 ESCAPE_STRINGS.put("&Dagger", '\u2021'); 1246 ESCAPE_STRINGS.put("&permil", '\u2030'); 1247 ESCAPE_STRINGS.put("&lsaquo", '\u2039'); 1248 ESCAPE_STRINGS.put("&rsaquo", '\u203A'); 1249 ESCAPE_STRINGS.put("&euro", '\u20AC'); 1250 1251 HEX_LETTERS = new HashSet<Character>(12); 1252 1253 HEX_LETTERS.add('a'); 1254 HEX_LETTERS.add('A'); 1255 HEX_LETTERS.add('b'); 1256 HEX_LETTERS.add('B'); 1257 HEX_LETTERS.add('c'); 1258 HEX_LETTERS.add('C'); 1259 HEX_LETTERS.add('d'); 1260 HEX_LETTERS.add('D'); 1261 HEX_LETTERS.add('e'); 1262 HEX_LETTERS.add('E'); 1263 HEX_LETTERS.add('f'); 1264 HEX_LETTERS.add('F'); 1265 } 1266 1267 /** 1268 * <p> 1269 * Replace all the occurences of HTML escape strings with the 1270 * respective characters. 1271 * </p> 1272 * <p> 1273 * The default mode is strict (requiring semicolons). 1274 * </p> 1275 * 1276 * @param s a <code>String</code> value 1277 * @return a <code>String</code> value 1278 * @throws NullPointerException if the input string is null. 1279 */ unescapeHTML(String s)1280 public static final String unescapeHTML(String s) { 1281 return unescapeHTML(s, false); 1282 } 1283 1284 /** 1285 * Replace all the occurences of HTML escape strings with the 1286 * respective characters. 1287 * 1288 * @param s a <code>String</code> value 1289 * @param emulateBrowsers a <code>Boolean</code> value that tells the method 1290 * to allow entity refs not terminated with a semicolon to be unescaped. 1291 * (a quirk of this feature, and some browsers, is that an explicit 1292 * terminating character is needed - e.g., <$ would be unescaped, but 1293 * not <ab - see the tests for a more in-depth description of browsers) 1294 * @return a <code>String</code> value 1295 * @throws NullPointerException if the input string is null. 1296 */ unescapeHTML(String s, boolean emulateBrowsers)1297 public static final String unescapeHTML(String s, boolean emulateBrowsers) { 1298 1299 // See if there are any '&' in the string since that is what we look 1300 // for to escape. If there isn't, then we don't need to escape this string 1301 // Based on similar technique used in the escape function. 1302 int index = s.indexOf('&'); 1303 if (index == -1) { 1304 // Nothing to escape. Return the original string. 1305 return s; 1306 } 1307 1308 // We found an escaped character. Start slow escaping from there. 1309 char[] chars = s.toCharArray(); 1310 char[] escaped = new char[chars.length]; 1311 System.arraycopy(chars, 0, escaped, 0, index); 1312 1313 // Note: escaped[pos] = end of the escaped char array. 1314 int pos = index; 1315 1316 for (int i = index; i < chars.length;) { 1317 if (chars[i] != '&') { 1318 escaped[pos++] = chars[i++]; 1319 continue; 1320 } 1321 1322 // Allow e.g. { 1323 int j = i + 1; 1324 boolean isNumericEntity = false; 1325 if (j < chars.length && chars[j] == '#') { 1326 j++; 1327 isNumericEntity = true; 1328 } 1329 1330 // if it's numeric, also check for hex 1331 boolean isHexEntity = false; 1332 if (j < chars.length && (chars[j] == 'x' || chars[j] == 'X')) { 1333 j++; 1334 isHexEntity = true; 1335 } 1336 1337 // Scan until we find a char that is not valid for this sequence. 1338 for (; j < chars.length; j++) { 1339 char ch = chars[j]; 1340 boolean isDigit = Character.isDigit(ch); 1341 if (isNumericEntity) { 1342 // non-hex numeric sequence end condition 1343 if (!isHexEntity && !isDigit) { 1344 break; 1345 } 1346 // hex sequence end contition 1347 if (isHexEntity && !isDigit && !HEX_LETTERS.contains(ch)) { 1348 break; 1349 } 1350 } 1351 // anything other than a digit or letter is always an end condition 1352 if (!isDigit && !Character.isLetter(ch)) { 1353 break; 1354 } 1355 } 1356 1357 boolean replaced = false; 1358 if ((j <= chars.length && emulateBrowsers) || 1359 (j < chars.length && chars[j] == ';')) { 1360 // Check for &#D; and 
 pattern 1361 if (i + 2 < chars.length && s.charAt(i + 1) == '#') { 1362 try { 1363 long charcode = 0; 1364 char ch = s.charAt(i + 2); 1365 if (isHexEntity) { 1366 charcode = Long.parseLong( 1367 new String(chars, i + 3, j - i - 3), 16); 1368 } else if (Character.isDigit(ch)) { 1369 charcode = Long.parseLong( 1370 new String(chars, i + 2, j - i - 2)); 1371 } 1372 if (charcode > 0 && charcode < 65536) { 1373 escaped[pos++] = (char) charcode; 1374 replaced = true; 1375 } 1376 } catch (NumberFormatException ex) { 1377 // Failed, not replaced. 1378 } 1379 } else { 1380 String key = new String(chars, i, j - i); 1381 Character repl = ESCAPE_STRINGS.get(key); 1382 if (repl != null) { 1383 escaped[pos++] = repl; 1384 replaced = true; 1385 } 1386 } 1387 // Skip over ';' 1388 if (j < chars.length && chars[j] == ';') { 1389 j++; 1390 } 1391 } 1392 1393 if (!replaced) { 1394 // Not a recognized escape sequence, leave as-is 1395 System.arraycopy(chars, i, escaped, pos, j - i); 1396 pos += j - i; 1397 } 1398 i = j; 1399 } 1400 return new String(escaped, 0, pos); 1401 } 1402 1403 // Escaper for < and > only. 1404 private static final CharEscaper LT_GT_ESCAPE = 1405 new CharEscaperBuilder() 1406 .addEscape('<', "<") 1407 .addEscape('>', ">") 1408 .toEscaper(); 1409 1410 private static final Pattern htmlTagPattern = 1411 Pattern.compile("</?[a-zA-Z][^>]*>"); 1412 1413 /** 1414 * Given a <code>String</code>, returns an equivalent <code>String</code> with 1415 * all HTML tags stripped. Note that HTML entities, such as "&amp;" will 1416 * still be preserved. 1417 */ stripHtmlTags(String string)1418 public static String stripHtmlTags(String string) { 1419 if ((string == null) || "".equals(string)) { 1420 return string; 1421 } 1422 String stripped = htmlTagPattern.matcher(string).replaceAll(""); 1423 /* 1424 * Certain inputs result in a well-formed HTML: 1425 * <<X>script>alert(0)<</X>/script> results in <script>alert(0)</script> 1426 * The following step ensures that no HTML can slip through by replacing all 1427 * < and > characters with < and > after HTML tags were stripped. 1428 */ 1429 return LT_GT_ESCAPE.escape(stripped); 1430 } 1431 1432 /** 1433 * We escape some characters in s to be able to insert strings into JavaScript 1434 * code. Also, make sure that we don't write out {@code -->} or 1435 * {@code </script>}, which may close a script tag, or any char in ["'>] which 1436 * might close a tag or attribute if seen inside an attribute. 1437 */ javaScriptEscape(CharSequence s)1438 public static String javaScriptEscape(CharSequence s) { 1439 return javaScriptEscapeHelper(s, false); 1440 } 1441 1442 /** 1443 * We escape some characters in s to be able to insert strings into JavaScript 1444 * code. Also, make sure that we don't write out {@code -->} or 1445 * {@code </script>}, which may close a script tag, or any char in ["'>] which 1446 * might close a tag or attribute if seen inside an attribute. 1447 * Turns all non-ascii characters into ASCII javascript escape sequences 1448 * (eg \\uhhhh or \ooo). 1449 */ javaScriptEscapeToAscii(CharSequence s)1450 public static String javaScriptEscapeToAscii(CharSequence s) { 1451 return javaScriptEscapeHelper(s, true); 1452 } 1453 1454 /** 1455 * Represents the type of javascript escaping to perform. Each enum below 1456 * determines whether to use octal escapes and how to handle quotes. 1457 */ 1458 public static enum JsEscapingMode { 1459 /** No octal escapes, pass-through ', and escape " as \". */ 1460 JSON, 1461 1462 /** Octal escapes, escapes ' and " to \42 and \47, respectively. */ 1463 EMBEDDABLE_JS, 1464 1465 /** Octal escapes, escapes ' and " to \' and \". */ 1466 MINIMAL_JS 1467 } 1468 1469 /** 1470 * Helper for javaScriptEscape and javaScriptEscapeToAscii 1471 */ javaScriptEscapeHelper(CharSequence s, boolean escapeToAscii)1472 private static String javaScriptEscapeHelper(CharSequence s, 1473 boolean escapeToAscii) { 1474 StringBuilder sb = new StringBuilder(s.length() * 9 / 8); 1475 try { 1476 escapeStringBody(s, escapeToAscii, JsEscapingMode.EMBEDDABLE_JS, sb); 1477 } catch (IOException ex) { 1478 // StringBuilder.append does not throw IOExceptions. 1479 throw new RuntimeException(ex); 1480 } 1481 return sb.toString(); 1482 } 1483 1484 /** 1485 * Appends the javascript string literal equivalent of plainText to the given 1486 * out buffer. 1487 * @param plainText the string to escape. 1488 * @param escapeToAscii true to encode all characters not in ascii [\x20-\x7e] 1489 * <br> 1490 * Full escaping of unicode entites isn't required but this makes 1491 * sure that unicode strings will survive regardless of the 1492 * content-encoding of the javascript file which is important when 1493 * we use this function to autogenerated javascript source files. 1494 * This is disabled by default because it makes non-latin strings very long. 1495 * <br> 1496 * If you seem to have trouble with character-encodings, maybe 1497 * turn this on to see if the problem goes away. If so, you need 1498 * to specify a character encoding for your javascript somewhere. 1499 * @param jsEscapingMode determines the type of escaping to perform. 1500 * @param out the buffer to append output to. 1501 */ 1502 /* 1503 * To avoid fallthrough, we would have to either use a hybrid switch-case/if 1504 * approach (which would obscure our special handling for ' and "), duplicate 1505 * the content of the default case, or pass a half-dozen parameters to a 1506 * helper method containing the code from the default case. 1507 */ 1508 @SuppressWarnings("fallthrough") escapeStringBody( CharSequence plainText, boolean escapeToAscii, JsEscapingMode jsEscapingMode, Appendable out)1509 public static void escapeStringBody( 1510 CharSequence plainText, boolean escapeToAscii, 1511 JsEscapingMode jsEscapingMode, Appendable out) 1512 throws IOException { 1513 int pos = 0; // Index just past the last char in plainText written to out. 1514 int len = plainText.length(); 1515 for (int codePoint, charCount, i = 0; i < len; i += charCount) { 1516 codePoint = Character.codePointAt(plainText, i); 1517 charCount = Character.charCount(codePoint); 1518 1519 if (!shouldEscapeChar(codePoint, escapeToAscii, jsEscapingMode)) { 1520 continue; 1521 } 1522 1523 out.append(plainText, pos, i); 1524 pos = i + charCount; 1525 switch (codePoint) { 1526 case '\b': out.append("\\b"); break; 1527 case '\t': out.append("\\t"); break; 1528 case '\n': out.append("\\n"); break; 1529 case '\f': out.append("\\f"); break; 1530 case '\r': out.append("\\r"); break; 1531 case '\\': out.append("\\\\"); break; 1532 case '"': case '\'': 1533 if (jsEscapingMode == JsEscapingMode.JSON && '\'' == codePoint) { 1534 // JSON does not escape a single quote (and it should be surrounded 1535 // by double quotes). 1536 out.append((char) codePoint); 1537 break; 1538 } else if (jsEscapingMode != JsEscapingMode.EMBEDDABLE_JS) { 1539 out.append('\\').append((char) codePoint); 1540 break; 1541 } 1542 // fall through 1543 default: 1544 if (codePoint >= 0x100 || jsEscapingMode == JsEscapingMode.JSON) { 1545 appendHexJavaScriptRepresentation(codePoint, out); 1546 } else { 1547 // Output the minimal octal encoding. We can't use an encoding 1548 // shorter than three digits if the next digit is a valid octal 1549 // digit. 1550 boolean pad = i + charCount >= len 1551 || isOctal(plainText.charAt(i + charCount)); 1552 appendOctalJavaScriptRepresentation((char) codePoint, pad, out); 1553 } 1554 break; 1555 } 1556 } 1557 out.append(plainText, pos, len); 1558 } 1559 1560 /** 1561 * Helper for escapeStringBody, which decides whether to escape a character. 1562 */ shouldEscapeChar(int codePoint, boolean escapeToAscii, JsEscapingMode jsEscapingMode)1563 private static boolean shouldEscapeChar(int codePoint, 1564 boolean escapeToAscii, JsEscapingMode jsEscapingMode) { 1565 // If non-ASCII chars should be escaped, identify non-ASCII code points. 1566 if (escapeToAscii && (codePoint < 0x20 || codePoint > 0x7e)) { 1567 return true; 1568 } 1569 1570 // If in JSON escaping mode, check JSON *and* JS escaping rules. The JS 1571 // escaping rules will escape more characters than needed for JSON, 1572 // but it is safe to escape any character in JSON. 1573 // TODO(bbavar): Remove unnecessary escaping for JSON, as long as it can be 1574 // shown that this change in legacy behavior is safe. 1575 if (jsEscapingMode == JsEscapingMode.JSON) { 1576 return mustEscapeCharInJsonString(codePoint) 1577 || mustEscapeCharInJsString(codePoint); 1578 } 1579 1580 // Finally, just check the default JS escaping rules. 1581 return mustEscapeCharInJsString(codePoint); 1582 } 1583 1584 /** 1585 * Returns a javascript representation of the character in a hex escaped 1586 * format. 1587 * 1588 * @param codePoint The codepoint to append. 1589 * @param out The buffer to which the hex representation should be appended. 1590 */ appendHexJavaScriptRepresentation( int codePoint, Appendable out)1591 private static void appendHexJavaScriptRepresentation( 1592 int codePoint, Appendable out) 1593 throws IOException { 1594 if (Character.isSupplementaryCodePoint(codePoint)) { 1595 // Handle supplementary unicode values which are not representable in 1596 // javascript. We deal with these by escaping them as two 4B sequences 1597 // so that they will round-trip properly when sent from java to javascript 1598 // and back. 1599 char[] surrogates = Character.toChars(codePoint); 1600 appendHexJavaScriptRepresentation(surrogates[0], out); 1601 appendHexJavaScriptRepresentation(surrogates[1], out); 1602 return; 1603 } 1604 out.append("\\u") 1605 .append(HEX_CHARS[(codePoint >>> 12) & 0xf]) 1606 .append(HEX_CHARS[(codePoint >>> 8) & 0xf]) 1607 .append(HEX_CHARS[(codePoint >>> 4) & 0xf]) 1608 .append(HEX_CHARS[codePoint & 0xf]); 1609 } 1610 1611 /** 1612 * Returns a javascript representation of the character in a hex escaped 1613 * format. Although this is a rather specific method, it is made public 1614 * because it is also used by the JSCompiler. 1615 * 1616 * @param ch The character to append. 1617 * @param pad true to force use of the full 3 digit representation. 1618 * @param out The buffer to which the hex representation should be appended. 1619 */ appendOctalJavaScriptRepresentation( char ch, boolean pad, Appendable out)1620 private static void appendOctalJavaScriptRepresentation( 1621 char ch, boolean pad, Appendable out) throws IOException { 1622 if (ch >= 0100 1623 // Be paranoid at the end of a string since someone might call 1624 // this method again with another string segment. 1625 || pad) { 1626 out.append('\\') 1627 .append(OCTAL_CHARS[(ch >>> 6) & 0x7]) 1628 .append(OCTAL_CHARS[(ch >>> 3) & 0x7]) 1629 .append(OCTAL_CHARS[ch & 0x7]); 1630 } else if (ch >= 010) { 1631 out.append('\\') 1632 .append(OCTAL_CHARS[(ch >>> 3) & 0x7]) 1633 .append(OCTAL_CHARS[ch & 0x7]); 1634 } else { 1635 out.append('\\') 1636 .append(OCTAL_CHARS[ch & 0x7]); 1637 } 1638 } 1639 1640 /** 1641 * Although this is a rather specific method, it is made public 1642 * because it is also used by the JSCompiler. 1643 * 1644 * @see #appendHexJavaScriptRepresentation(int, Appendable) 1645 */ appendHexJavaScriptRepresentation(StringBuilder sb, char c)1646 public static void appendHexJavaScriptRepresentation(StringBuilder sb, 1647 char c) { 1648 try { 1649 appendHexJavaScriptRepresentation(c, sb); 1650 } catch (IOException ex) { 1651 // StringBuilder does not throw IOException. 1652 throw new RuntimeException(ex); 1653 } 1654 } 1655 1656 /** 1657 * Undo escaping as performed in javaScriptEscape(.) 1658 * Throws an IllegalArgumentException if the string contains 1659 * bad escaping. 1660 */ javaScriptUnescape(String s)1661 public static String javaScriptUnescape(String s) { 1662 StringBuilder sb = new StringBuilder(s.length()); 1663 for (int i = 0; i < s.length(); ) { 1664 char c = s.charAt(i); 1665 if (c == '\\') { 1666 i = javaScriptUnescapeHelper(s, i + 1, sb); 1667 } else { 1668 sb.append(c); 1669 i++; 1670 } 1671 } 1672 return sb.toString(); 1673 } 1674 1675 /** 1676 * Looks for an escape code starting at index i of s, 1677 * and appends it to sb. 1678 * @return the index of the first character in s 1679 * after the escape code. 1680 * @throws IllegalArgumentException if the escape code 1681 * is invalid 1682 */ javaScriptUnescapeHelper(String s, int i, StringBuilder sb)1683 private static int javaScriptUnescapeHelper(String s, int i, 1684 StringBuilder sb) { 1685 if (i >= s.length()) { 1686 throw new IllegalArgumentException( 1687 "End-of-string after escape character in [" + s + "]"); 1688 } 1689 1690 char c = s.charAt(i++); 1691 switch (c) { 1692 case 'n': sb.append('\n'); break; 1693 case 'r': sb.append('\r'); break; 1694 case 't': sb.append('\t'); break; 1695 case 'b': sb.append('\b'); break; 1696 case 'f': sb.append('\f'); break; 1697 case '\\': 1698 case '\"': 1699 case '\'': 1700 case '>': 1701 sb.append(c); 1702 break; 1703 case '0': case '1': case '2': case '3': 1704 case '4': case '5': case '6': case '7': 1705 --i; // backup to first octal digit 1706 int nOctalDigits = 1; 1707 int digitLimit = c < '4' ? 3 : 2; 1708 while (nOctalDigits < digitLimit && i + nOctalDigits < s.length() 1709 && isOctal(s.charAt(i + nOctalDigits))) { 1710 ++nOctalDigits; 1711 } 1712 sb.append( 1713 (char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8)); 1714 i += nOctalDigits; 1715 break; 1716 case 'x': 1717 case 'u': 1718 String hexCode; 1719 int nHexDigits = (c == 'u' ? 4 : 2); 1720 try { 1721 hexCode = s.substring(i, i + nHexDigits); 1722 } catch (IndexOutOfBoundsException ioobe) { 1723 throw new IllegalArgumentException( 1724 "Invalid unicode sequence [" + s.substring(i) + "] at index " + i 1725 + " in [" + s + "]"); 1726 } 1727 int unicodeValue; 1728 try { 1729 unicodeValue = Integer.parseInt(hexCode, 16); 1730 } catch (NumberFormatException nfe) { 1731 throw new IllegalArgumentException( 1732 "Invalid unicode sequence [" + hexCode + "] at index " + i + 1733 " in [" + s + "]"); 1734 } 1735 sb.append((char) unicodeValue); 1736 i += nHexDigits; 1737 break; 1738 default: 1739 throw new IllegalArgumentException( 1740 "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]" 1741 ); 1742 } 1743 1744 return i; 1745 } 1746 1747 // C0 control characters except \t, \n, and \r and 0xFFFE and 0xFFFF 1748 private static final CharMatcher CONTROL_MATCHER = CharMatcher.anyOf( 1749 "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + 1750 "\u0008\u000B\u000C\u000E\u000F" + 1751 "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + 1752 "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + 1753 "\uFFFE\uFFFF"); 1754 1755 /** 1756 * Escape a string that is meant to be embedded in a CDATA section. 1757 * The returned string is guaranteed to be valid CDATA content. 1758 * The syntax of CDATA sections is the following: 1759 * <blockquote> 1760 * <code><[!CDATA[...]]></code> 1761 * </blockquote> 1762 * The only invalid character sequence in a CDATA tag is "]]>". 1763 * If this sequence is present in the input string, we replace 1764 * it by closing the current CDATA field, then write ']]&gt;', 1765 * then reopen a new CDATA section. 1766 */ 1767 public static String xmlCDataEscape(String s) { 1768 // Make sure there are no illegal control characters. 1769 s = CONTROL_MATCHER.removeFrom(s); 1770 // Return the original reference if the string doesn't have a match. 1771 int found = s.indexOf("]]>"); 1772 if (found == -1) { 1773 return s; 1774 } 1775 1776 // For each occurrence of "]]>", append a string that adds "]]>" after 1777 // the end of the CDATA which has just been closed, then opens a new CDATA. 1778 StringBuilder sb = new StringBuilder(); 1779 int prev = 0; 1780 do { 1781 sb.append(s.substring(prev, found + 3)); 1782 sb.append("]]><![CDATA["); 1783 prev = found + 3; 1784 } while ((found = s.indexOf("]]>", prev)) != -1); 1785 sb.append(s.substring(prev)); 1786 return sb.toString(); 1787 } 1788 1789 /** 1790 * We escape some characters in s to be able to insert strings into Java code 1791 * 1792 * @deprecated Use {@link CharEscapers#asciiHtmlEscaper()} and {@link 1793 * CharEscapers#javaCharEscaper()} or {@link CharEscapers#javaStringEscaper()} 1794 * instead. This method combines two forms of escaping in a way that's rarely 1795 * desired. 1796 */ 1797 @Deprecated 1798 public static String javaEscape(String s) { 1799 return JAVA_ESCAPE.escape(s); 1800 } 1801 1802 // Java escaper. 1803 private static final CharEscaper JAVA_ESCAPE = 1804 new CharEscaperBuilder() 1805 .addEscape('\n', "\\n") 1806 .addEscape('\r', "\\r") 1807 .addEscape('\t', "\\t") 1808 .addEscape('\\', "\\\\") 1809 .addEscape('\"', "\\\"") 1810 .addEscape('&', "&") 1811 .addEscape('<', "<") 1812 .addEscape('>', ">") 1813 .addEscape('\'', "\\\'") 1814 .toEscaper(); 1815 1816 /** 1817 * Escapes the special characters from a string so it can be used as part of 1818 * a regex pattern. This method is for use on gnu.regexp style regular 1819 * expressions. 1820 * 1821 * @deprecated Use {@link Pattern#quote(String)} instead. Note that it may not 1822 * be compatible with gnu.regexp style regular expressions. 1823 */ 1824 @Deprecated 1825 public static String regexEscape(String s) { 1826 return REGEX_ESCAPE.escape(s); 1827 } 1828 1829 // Regex escaper escapes all regex characters. 1830 private static final CharEscaper REGEX_ESCAPE = 1831 new CharEscaperBuilder() 1832 .addEscape('(', "\\(") 1833 .addEscape(')', "\\)") 1834 .addEscape('|', "\\|") 1835 .addEscape('*', "\\*") 1836 .addEscape('+', "\\+") 1837 .addEscape('?', "\\?") 1838 .addEscape('.', "\\.") 1839 .addEscape('{', "\\{") 1840 .addEscape('}', "\\}") 1841 .addEscape('[', "\\[") 1842 .addEscape(']', "\\]") 1843 .addEscape('$', "\\$") 1844 .addEscape('^', "\\^") 1845 .addEscape('\\', "\\\\") 1846 .toEscaper(); 1847 1848 /** 1849 * If you want to preserve the exact 1850 * current (odd) behavior when {@code doStrip} is {@code true}, use 1851 * {@code .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on 1852 * the splitter. 1853 * 1854 * @param in what to process 1855 * @param delimiter the delimiting string 1856 * @return the tokens 1857 * @deprecated see the detailed instructions under 1858 * {@link #split(String, String, boolean)} 1859 */ 1860 @Deprecated 1861 public static LinkedList<String> string2List( 1862 String in, String delimiter, boolean doStrip) { 1863 if (in == null) { 1864 return null; 1865 } 1866 1867 LinkedList<String> out = new LinkedList<String>(); 1868 string2Collection(in, delimiter, doStrip, out); 1869 return out; 1870 } 1871 1872 /** 1873 * See the detailed instructions under {@link 1874 * #split(String, String, boolean)}. Pass the resulting {@code Iterable} to 1875 * {@link com.google.common.collect.Sets#newHashSet(Iterable)}. If you want to 1876 * preserve the exact current (odd) behavior when {@code doStrip} is {@code 1877 * true}, use {@code 1878 * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the 1879 * splitter. 1880 * 1881 * @param in what to process 1882 * @param delimiter the delimiting string 1883 * @param doStrip to strip the substrings before adding to the list 1884 * @return the tokens 1885 * @deprecated see the detailed instructions under 1886 * {@link #split(String, String, boolean)} 1887 */ 1888 @Deprecated 1889 public static Set<String> string2Set( 1890 String in, String delimiter, boolean doStrip) { 1891 if (in == null) { 1892 return null; 1893 } 1894 1895 HashSet<String> out = new HashSet<String>(); 1896 string2Collection(in, delimiter, doStrip, out); 1897 return out; 1898 } 1899 1900 /** 1901 * See the detailed instructions under {@link 1902 * #split(String, String, boolean)}. If you want to preserve the exact current 1903 * (odd) behavior when {@code doStrip} is {@code true}, use {@code 1904 * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the 1905 * splitter. 1906 * 1907 * @param in The delimited input string to process 1908 * @param delimiter The string delimiting entries in the input string. 1909 * @param doStrip whether to strip the substrings before adding to the 1910 * collection 1911 * @param collection The collection to which the strings will be added. If 1912 * <code>null</code>, a new <code>List</code> will be created. 1913 * @return The collection to which the substrings were added. This is 1914 * syntactic sugar to allow call chaining. 1915 * @deprecated see the detailed instructions under 1916 * {@link #split(String, String, boolean)} 1917 */ 1918 @Deprecated 1919 public static Collection<String> string2Collection( 1920 String in, 1921 String delimiter, 1922 boolean doStrip, 1923 Collection<String> collection) { 1924 if (in == null) { 1925 return null; 1926 } 1927 if (collection == null) { 1928 collection = new ArrayList<String>(); 1929 } 1930 if (delimiter == null || delimiter.length() == 0) { 1931 collection.add(in); 1932 return collection; 1933 } 1934 1935 int fromIndex = 0; 1936 int pos; 1937 while ((pos = in.indexOf(delimiter, fromIndex)) >= 0) { 1938 String interim = in.substring(fromIndex, pos); 1939 if (doStrip) { 1940 interim = strip(interim); 1941 } 1942 if (!doStrip || interim.length() > 0) { 1943 collection.add(interim); 1944 } 1945 1946 fromIndex = pos + delimiter.length(); 1947 } 1948 1949 String interim = in.substring(fromIndex); 1950 if (doStrip) { 1951 interim = strip(interim); 1952 } 1953 if (!doStrip || interim.length() > 0) { 1954 collection.add(interim); 1955 } 1956 1957 return collection; 1958 } 1959 1960 /** 1961 * This converts a string to a Map. It will first split the string into 1962 * entries using delimEntry. Then each entry is split into a key and a value 1963 * using delimKey. By default we strip the keys. Use doStripEntry to strip 1964 * also the entries. 1965 * 1966 * Note that this method returns a {@link HashMap}, which means that entries 1967 * will be in no particular order. See {@link #stringToOrderedMap}. 1968 * 1969 * @param in the string to be processed 1970 * @param delimEntry delimiter for the entries 1971 * @param delimKey delimiter between keys and values 1972 * @param doStripEntry strip entries before inserting in the map 1973 * 1974 * @return HashMap 1975 */ string2Map( String in, String delimEntry, String delimKey, boolean doStripEntry)1976 public static HashMap<String, String> string2Map( 1977 String in, String delimEntry, String delimKey, 1978 boolean doStripEntry) { 1979 if (in == null) { 1980 return null; 1981 } 1982 1983 return stringToMapImpl(new HashMap<String, String>(), in, delimEntry, 1984 delimKey, doStripEntry); 1985 } 1986 1987 /** 1988 * This converts a string to a Map, with entries in the same order as the 1989 * key/value pairs in the input string. It will first split the string into 1990 * entries using delimEntry. Then each entry is split into a key and a value 1991 * using delimKey. By default we strip the keys. Use doStripEntry to strip 1992 * also the entries. 1993 * 1994 * @param in the string to be processed 1995 * @param delimEntry delimiter for the entries 1996 * @param delimKey delimiter between keys and values 1997 * @param doStripEntry strip entries before inserting in the map 1998 * 1999 * @return key/value pairs as a Map, in order 2000 */ stringToOrderedMap( String in, String delimEntry, String delimKey, boolean doStripEntry)2001 public static Map<String, String> stringToOrderedMap( 2002 String in, String delimEntry, String delimKey, 2003 boolean doStripEntry) { 2004 if (in == null) { 2005 return null; 2006 } 2007 2008 return stringToMapImpl(new LinkedHashMap<String, String>(), in, delimEntry, 2009 delimKey, doStripEntry); 2010 } 2011 2012 /** 2013 * This adds key/value pairs from the given string to the given Map. 2014 * It will first split the string into entries using delimEntry. Then each 2015 * entry is split into a key and a value using delimKey. By default we 2016 * strip the keys. Use doStripEntry to strip also the entries. 2017 * 2018 * @param out - Map to output into 2019 * @param in - the string to be processed 2020 * @param delimEntry - delimiter for the entries 2021 * @param delimKey - delimiter between keys and values 2022 * @param doStripEntry - strip entries before inserting in the map 2023 * @return out, for caller's convenience 2024 */ stringToMapImpl(T out, String in, String delimEntry, String delimKey, boolean doStripEntry)2025 private static <T extends Map<String, String>> T stringToMapImpl(T out, 2026 String in, String delimEntry, String delimKey, boolean doStripEntry) { 2027 2028 if (isEmpty(delimEntry) || isEmpty(delimKey)) { 2029 out.put(strip(in), ""); 2030 return out; 2031 } 2032 2033 Iterator<String> it = string2List(in, delimEntry, false).iterator(); 2034 int len = delimKey.length(); 2035 while (it.hasNext()) { 2036 String entry = it.next(); 2037 int pos = entry.indexOf(delimKey); 2038 if (pos > 0) { 2039 String value = entry.substring(pos + len); 2040 if (doStripEntry) { 2041 value = strip(value); 2042 } 2043 out.put(strip(entry.substring(0, pos)), value); 2044 } else { 2045 out.put(strip(entry), ""); 2046 } 2047 } 2048 2049 return out; 2050 } 2051 2052 /** 2053 * This function concatenates the elements of a Map in a string with form 2054 * "<key1><sepKey><value1><sepEntry>...<keyN><sepKey><valueN>" 2055 * 2056 * @param in - the map to be converted 2057 * @param sepKey - the separator to put between key and value 2058 * @param sepEntry - the separator to put between map entries 2059 * @return String 2060 * @deprecated create a {@link MapJoiner}, for example {@code 2061 * Joiner.on(sepEntry).withKeyValueSeparator(sepKey)}. Ensure that your 2062 * map is non-null and use this map joiner's {@link MapJoiner#join(Map)} 2063 * method. To preserve behavior exactly, just in-line this method call. 2064 */ map2String( Map<K, V> in, String sepKey, String sepEntry)2065 @Deprecated public static <K, V> String map2String( 2066 Map<K, V> in, String sepKey, String sepEntry) { 2067 return (in == null) ? null : Joiner 2068 .on(sepEntry) 2069 .useForNull("null") 2070 .withKeyValueSeparator(sepKey) 2071 .join(in); 2072 } 2073 2074 /** 2075 * Given a map, creates and returns a new map in which all keys are the 2076 * lower-cased version of each key. 2077 * 2078 * @param map A map containing String keys to be lowercased 2079 * @throws IllegalArgumentException if the map contains duplicate string keys 2080 * after lower casing 2081 */ lowercaseKeys(Map<String, V> map)2082 public static <V> Map<String, V> lowercaseKeys(Map<String, V> map) { 2083 Map<String, V> result = new HashMap<String, V>(map.size()); 2084 for (Map.Entry<String, V> entry : map.entrySet()) { 2085 String key = entry.getKey(); 2086 if (result.containsKey(key.toLowerCase())) { 2087 throw new IllegalArgumentException( 2088 "Duplicate string key in map when lower casing"); 2089 } 2090 result.put(key.toLowerCase(), entry.getValue()); 2091 } 2092 return result; 2093 } 2094 2095 /** 2096 * Replaces any string of adjacent whitespace characters with the whitespace 2097 * character " ". 2098 * 2099 * @param str the string you want to munge 2100 * @return String with no more excessive whitespace! 2101 * @deprecated ensure the string is not null and use {@code 2102 * CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ')}; also consider 2103 * whether you really want the legacy whitespace definition, or something 2104 * more standard like {@link CharMatcher#WHITESPACE}. 2105 */ collapseWhitespace(String str)2106 @Deprecated public static String collapseWhitespace(String str) { 2107 return (str == null) ? null 2108 : CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' '); 2109 } 2110 2111 /** 2112 * Replaces any string of matched characters with the supplied string.<p> 2113 * 2114 * This is a more general version of collapseWhitespace. 2115 * 2116 * <pre> 2117 * E.g. collapse("hello world", " ", "::") 2118 * will return the following string: "hello::world" 2119 * </pre> 2120 * 2121 * @param str the string you want to munge 2122 * @param chars all of the characters to be considered for munge 2123 * @param replacement the replacement string 2124 * @return munged and replaced string. 2125 * @deprecated if {@code replacement} is the empty string, use {@link 2126 * CharMatcher#removeFrom(CharSequence)}; if it is a single character, 2127 * use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer 2128 * replacement strings use {@link String#replaceAll(String, String)} with 2129 * a regular expression that matches one or more occurrences of {@code 2130 * chars}. In all cases you must first ensure that {@code str} is not 2131 * null. 2132 */ collapse( String str, String chars, String replacement)2133 @Deprecated public static String collapse( 2134 String str, String chars, String replacement) { 2135 if (str == null) { 2136 return null; 2137 } 2138 2139 StringBuilder newStr = new StringBuilder(); 2140 2141 boolean prevCharMatched = false; 2142 char c; 2143 for (int i = 0; i < str.length(); i++) { 2144 c = str.charAt(i); 2145 if (chars.indexOf(c) != -1) { 2146 // this character is matched 2147 if (prevCharMatched) { 2148 // apparently a string of matched chars, so don't append anything 2149 // to the string 2150 continue; 2151 } 2152 prevCharMatched = true; 2153 newStr.append(replacement); 2154 } else { 2155 prevCharMatched = false; 2156 newStr.append(c); 2157 } 2158 } 2159 2160 return newStr.toString(); 2161 } 2162 2163 /** 2164 * Returns a string with all sequences of ISO control chars (0x00 to 0x1F and 2165 * 0x7F to 0x9F) replaced by the supplied string. ISO control characters are 2166 * identified via {@link Character#isISOControl(char)}. 2167 * 2168 * @param str the string you want to strip of ISO control chars 2169 * @param replacement the replacement string 2170 * @return a String with all control characters replaced by the replacement 2171 * string, or null if input is null. 2172 * @deprecated use {@link CharMatcher#JAVA_ISO_CONTROL}. If {@code 2173 * replacement} is the empty string, use {@link 2174 * CharMatcher#removeFrom(CharSequence)}; if it is a single character, 2175 * use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer 2176 * replacement strings use 2177 * {@code str.replaceAll("\p{Cntrl}+", replacement)}. 2178 * In all cases you must first ensure that {@code str} is not null. 2179 */ collapseControlChars( String str, String replacement)2180 @Deprecated public static String collapseControlChars( 2181 String str, String replacement) { 2182 /* 2183 * We re-implement the StringUtil.collapse() loop here rather than call 2184 * collapse() with an input String of control chars, because matching via 2185 * isISOControl() is about 10x faster. 2186 */ 2187 if (str == null) { 2188 return null; 2189 } 2190 2191 StringBuilder newStr = new StringBuilder(); 2192 2193 boolean prevCharMatched = false; 2194 char c; 2195 for (int i = 0; i < str.length(); i++) { 2196 c = str.charAt(i); 2197 if (Character.isISOControl(c)) { 2198 // this character is matched 2199 if (prevCharMatched) { 2200 // apparently a string of matched chars, so don't append anything 2201 // to the string 2202 continue; 2203 } 2204 prevCharMatched = true; 2205 newStr.append(replacement); 2206 } else { 2207 prevCharMatched = false; 2208 newStr.append(c); 2209 } 2210 } 2211 2212 return newStr.toString(); 2213 } 2214 2215 /** 2216 * Read a String of up to maxLength bytes from an InputStream. 2217 * 2218 * <p>Note that this method uses the default platform encoding, and expects 2219 * that encoding to be single-byte, which is not always the case. Its use 2220 * is discouraged. For reading the entire stream (maxLength == -1) you can use: 2221 * <pre> 2222 * CharStreams.toString(new InputStreamReader(is, Charsets.ISO_8859_1)) 2223 * </pre> 2224 * {@code CharStreams} is in the {@code com.google.common.io} package. 2225 * 2226 * <p>For maxLength >= 0 a literal translation would be 2227 * <pre> 2228 * CharStreams.toString(new InputStreamReader( 2229 * new LimitInputStream(is, maxLength), Charsets.ISO_8859_1)) 2230 * </pre> 2231 * For multi-byte encodings that is broken because the limit could end in 2232 * the middle of the character--it would be better to limit the reader than 2233 * the underlying stream. 2234 * 2235 * @param is input stream 2236 * @param maxLength max number of bytes to read from "is". If this is -1, we 2237 * read everything. 2238 * 2239 * @return String up to maxLength bytes, read from "is" 2240 * @deprecated see the advice above 2241 */ stream2String(InputStream is, int maxLength)2242 @Deprecated public static String stream2String(InputStream is, int maxLength) 2243 throws IOException { 2244 byte[] buffer = new byte[4096]; 2245 StringWriter sw = new StringWriter(); 2246 int totalRead = 0; 2247 int read = 0; 2248 2249 do { 2250 sw.write(new String(buffer, 0, read)); 2251 totalRead += read; 2252 read = is.read(buffer, 0, buffer.length); 2253 } while (((-1 == maxLength) || (totalRead < maxLength)) && (read != -1)); 2254 2255 return sw.toString(); 2256 } 2257 2258 /** 2259 * Parse a list of substrings separated by a given delimiter. The delimiter 2260 * can also appear in substrings (just double them): 2261 * 2262 * parseDelimitedString("this|is", '|') returns ["this","is"] 2263 * parseDelimitedString("this||is", '|') returns ["this|is"] 2264 * 2265 * @param list String containing delimited substrings 2266 * @param delimiter Delimiter (anything except ' ' is allowed) 2267 * 2268 * @return String[] A String array of parsed substrings 2269 */ parseDelimitedList(String list, char delimiter)2270 public static String[] parseDelimitedList(String list, 2271 char delimiter) { 2272 String delim = "" + delimiter; 2273 // Append a sentinel of delimiter + space 2274 // (see comments below for more info) 2275 StringTokenizer st = new StringTokenizer(list + delim + " ", 2276 delim, 2277 true); 2278 ArrayList<String> v = new ArrayList<String>(); 2279 String lastToken = ""; 2280 StringBuilder word = new StringBuilder(); 2281 2282 // We keep a sliding window of 2 tokens 2283 // 2284 // delimiter : delimiter -> append delimiter to current word 2285 // and clear most recent token 2286 // (so delim : delim : delim will not 2287 // be treated as two escaped delims.) 2288 // 2289 // tok : delimiter -> append tok to current word 2290 // 2291 // delimiter : tok -> add current word to list, and clear it. 2292 // (We append a sentinel that conforms to this 2293 // pattern to make sure we've pushed every parsed token) 2294 while (st.hasMoreTokens()) { 2295 String tok = st.nextToken(); 2296 if (lastToken != null) { 2297 if (tok.equals(delim)) { 2298 word.append(lastToken); 2299 if (lastToken.equals(delim)) { tok = null; } 2300 } else { 2301 if (word.length() != 0) { 2302 v.add(word.toString()); 2303 } 2304 word.setLength(0); 2305 } 2306 } 2307 lastToken = tok; 2308 } 2309 2310 return v.toArray(new String[0]); 2311 } 2312 2313 /** 2314 * Compares two strings, guarding against nulls. 2315 * 2316 * @param nullsAreGreater true if nulls should be greater than any string, 2317 * false is less than. 2318 * @deprecated use {@link String#CASE_INSENSITIVE_ORDER}, together with 2319 * {@link com.google.common.collect.Ordering#nullsFirst()} or 2320 * {@link com.google.common.collect.Ordering#nullsLast()} if 2321 * needed 2322 */ compareToIgnoreCase(String s1, String s2, boolean nullsAreGreater)2323 @Deprecated public static int compareToIgnoreCase(String s1, String s2, 2324 boolean nullsAreGreater) { 2325 if (s1 == s2) { 2326 return 0; // Either both the same String, or both null 2327 } 2328 if (s1 == null) { 2329 return nullsAreGreater ? 1 : -1; 2330 } 2331 if (s2 == null) { 2332 return nullsAreGreater ? -1 : 1; 2333 } 2334 return s1.compareToIgnoreCase(s2); 2335 } 2336 2337 /** 2338 * Splits s with delimiters in delimiter and returns the last token 2339 */ lastToken(String s, String delimiter)2340 public static String lastToken(String s, String delimiter) { 2341 return s.substring(CharMatcher.anyOf(delimiter).lastIndexIn(s) + 1); 2342 } 2343 2344 private static final Pattern characterReferencePattern = 2345 Pattern.compile("&#?[a-zA-Z0-9]{1,8};"); 2346 2347 /** 2348 * Determines if a string contains what looks like an html character 2349 * reference. Useful for deciding whether unescaping is necessary. 2350 */ containsCharRef(String s)2351 public static boolean containsCharRef(String s) { 2352 return characterReferencePattern.matcher(s).find(); 2353 } 2354 2355 /** 2356 * Determines if a string is a Hebrew word. A string is considered to be 2357 * a Hebrew word if {@link #isHebrew(int)} is true for any of its characters. 2358 */ isHebrew(String s)2359 public static boolean isHebrew(String s) { 2360 int len = s.length(); 2361 for (int i = 0; i < len; ++i) { 2362 if (isHebrew(s.codePointAt(i))) { 2363 return true; 2364 } 2365 } 2366 return false; 2367 } 2368 2369 /** 2370 * Determines if a character is a Hebrew character. 2371 */ isHebrew(int codePoint)2372 public static boolean isHebrew(int codePoint) { 2373 return Character.UnicodeBlock.HEBREW.equals( 2374 Character.UnicodeBlock.of(codePoint)); 2375 } 2376 2377 /** 2378 * Determines if a string is a CJK word. A string is considered to be CJK 2379 * if {@link #isCjk(char)} is true for any of its characters. 2380 */ isCjk(String s)2381 public static boolean isCjk(String s) { 2382 int len = s.length(); 2383 for (int i = 0; i < len; ++i) { 2384 if (isCjk(s.codePointAt(i))) { 2385 return true; 2386 } 2387 } 2388 return false; 2389 } 2390 2391 /** 2392 * Unicode code blocks containing CJK characters. 2393 */ 2394 private static final Set<Character.UnicodeBlock> CJK_BLOCKS; 2395 static { 2396 Set<Character.UnicodeBlock> set = new HashSet<Character.UnicodeBlock>(); 2397 set.add(Character.UnicodeBlock.HANGUL_JAMO); 2398 set.add(Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT); 2399 set.add(Character.UnicodeBlock.KANGXI_RADICALS); 2400 set.add(Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION); 2401 set.add(Character.UnicodeBlock.HIRAGANA); 2402 set.add(Character.UnicodeBlock.KATAKANA); 2403 set.add(Character.UnicodeBlock.BOPOMOFO); 2404 set.add(Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO); 2405 set.add(Character.UnicodeBlock.KANBUN); 2406 set.add(Character.UnicodeBlock.BOPOMOFO_EXTENDED); 2407 set.add(Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS); 2408 set.add(Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS); 2409 set.add(Character.UnicodeBlock.CJK_COMPATIBILITY); 2410 set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A); 2411 set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS); 2412 set.add(Character.UnicodeBlock.HANGUL_SYLLABLES); 2413 set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS); 2414 set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS); 2415 set.add(Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS); 2416 set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B); 2417 set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT); 2418 CJK_BLOCKS = Collections.unmodifiableSet(set); 2419 } 2420 2421 /** 2422 * Determines if a character is a CJK ideograph or a character typically 2423 * used only in CJK text. 2424 * 2425 * Note: This function cannot handle supplementary characters. To handle all 2426 * Unicode characters, including supplementary characters, use the function 2427 * {@link #isCjk(int)}. 2428 */ isCjk(char ch)2429 public static boolean isCjk(char ch) { 2430 return isCjk((int) ch); 2431 } 2432 2433 /** 2434 * Determines if a character is a CJK ideograph or a character typically 2435 * used only in CJK text. 2436 */ isCjk(int codePoint)2437 public static boolean isCjk(int codePoint) { 2438 // Time-saving early exit for all Latin-1 characters. 2439 if ((codePoint & 0xFFFFFF00) == 0) { 2440 return false; 2441 } 2442 2443 return CJK_BLOCKS.contains(Character.UnicodeBlock.of(codePoint)); 2444 } 2445 2446 /** 2447 * Returns the approximate display width of the string, measured in units of 2448 * ascii characters. 2449 * 2450 * @see StringUtil#displayWidth(char) 2451 */ displayWidth(String s)2452 public static int displayWidth(String s) { 2453 // TODO(kevinb): could reimplement this as 2454 // return s.length() * 2 - CharMatcher.SINGLE_WIDTH.countIn(s); 2455 int width = 0; 2456 int len = s.length(); 2457 for (int i = 0; i < len; ++i) { 2458 width += displayWidth(s.charAt(i)); 2459 } 2460 return width; 2461 } 2462 2463 /** 2464 * Returns the approximate display width of the character, measured 2465 * in units of ascii characters. 2466 * 2467 * This method should err on the side of caution. By default, characters 2468 * are assumed to have width 2; this covers CJK ideographs, various 2469 * symbols and miscellaneous weird scripts. Given below are some Unicode 2470 * ranges for which it seems safe to assume that no character is 2471 * substantially wider than an ascii character: 2472 * - Latin, extended Latin, even more extended Latin. 2473 * - Greek, extended Greek, Cyrillic. 2474 * - Some symbols (including currency symbols) and punctuation. 2475 * - Half-width Katakana and Hangul. 2476 * - Hebrew 2477 * - Arabic 2478 * - Thai 2479 * Characters in these ranges are given a width of 1. 2480 * 2481 * IMPORTANT: this function has analogs in C++ (encodingutils.cc, 2482 * named UnicodeCharWidth) and JavaScript 2483 * (java/com/google/ads/common/frontend/adwordsbase/resources/CreateAdUtil.js), 2484 * which need to be updated if you change the implementation here. 2485 */ displayWidth(char ch)2486 public static int displayWidth(char ch) { 2487 if (ch <= '\u04f9' || // CYRILLIC SMALL LETTER YERU WITH DIAERESIS 2488 ch == '\u05be' || // HEBREW PUNCTUATION MAQAF 2489 (ch >= '\u05d0' && ch <= '\u05ea') || // HEBREW LETTER ALEF ... TAV 2490 ch == '\u05F3' || // HEBREW PUNCTUATION GERESH 2491 ch == '\u05f4' || // HEBREW PUNCTUATION GERSHAYIM 2492 (ch >= '\u0600' && ch <= '\u06ff') || // Block=Arabic 2493 (ch >= '\u0750' && ch <= '\u077f') || // Block=Arabic_Supplement 2494 (ch >= '\ufb50' && ch <= '\ufdff') || // Block=Arabic_Presentation_Forms-A 2495 (ch >= '\ufe70' && ch <= '\ufeff') || // Block=Arabic_Presentation_Forms-B 2496 (ch >= '\u1e00' && ch <= '\u20af') || /* LATIN CAPITAL LETTER A WITH RING BELOW 2497 ... DRACHMA SIGN */ 2498 (ch >= '\u2100' && ch <= '\u213a') || // ACCOUNT OF ... ROTATED CAPITAL Q 2499 (ch >= '\u0e00' && ch <= '\u0e7f') || // Thai 2500 (ch >= '\uff61' && ch <= '\uffdc')) { /* HALFWIDTH IDEOGRAPHIC FULL STOP 2501 ... HALFWIDTH HANGUL LETTER I */ 2502 return 1; 2503 } 2504 return 2; 2505 } 2506 2507 /** 2508 * @return a string representation of the given native array. 2509 */ toString(float[] iArray)2510 public static String toString(float[] iArray) { 2511 if (iArray == null) { 2512 return "NULL"; 2513 } 2514 2515 StringBuilder buffer = new StringBuilder(); 2516 buffer.append("["); 2517 for (int i = 0; i < iArray.length; i++) { 2518 buffer.append(iArray[i]); 2519 if (i != (iArray.length - 1)) { 2520 buffer.append(", "); 2521 } 2522 } 2523 buffer.append("]"); 2524 return buffer.toString(); 2525 } 2526 2527 /** 2528 * @return a string representation of the given native array. 2529 */ toString(long[] iArray)2530 public static String toString(long[] iArray) { 2531 if (iArray == null) { 2532 return "NULL"; 2533 } 2534 2535 StringBuilder buffer = new StringBuilder(); 2536 buffer.append("["); 2537 for (int i = 0; i < iArray.length; i++) { 2538 buffer.append(iArray[i]); 2539 if (i != (iArray.length - 1)) { 2540 buffer.append(", "); 2541 } 2542 } 2543 buffer.append("]"); 2544 return buffer.toString(); 2545 } 2546 2547 /** 2548 * @return a string representation of the given native array 2549 */ toString(int[] iArray)2550 public static String toString(int[] iArray) { 2551 if (iArray == null) { 2552 return "NULL"; 2553 } 2554 2555 StringBuilder buffer = new StringBuilder(); 2556 buffer.append("["); 2557 for (int i = 0; i < iArray.length; i++) { 2558 buffer.append(iArray[i]); 2559 if (i != (iArray.length - 1)) { 2560 buffer.append(", "); 2561 } 2562 } 2563 buffer.append("]"); 2564 return buffer.toString(); 2565 } 2566 2567 /** 2568 * @return a string representation of the given array. 2569 */ toString(String[] iArray)2570 public static String toString(String[] iArray) { 2571 if (iArray == null) { return "NULL"; } 2572 2573 StringBuilder buffer = new StringBuilder(); 2574 buffer.append("["); 2575 for (int i = 0; i < iArray.length; i++) { 2576 buffer.append("'").append(iArray[i]).append("'"); 2577 if (i != iArray.length - 1) { 2578 buffer.append(", "); 2579 } 2580 } 2581 buffer.append("]"); 2582 2583 return buffer.toString(); 2584 } 2585 2586 /** 2587 * Returns the string, in single quotes, or "NULL". Intended only for 2588 * logging. 2589 * 2590 * @param s the string 2591 * @return the string, in single quotes, or the string "null" if it's null. 2592 */ toString(String s)2593 public static String toString(String s) { 2594 if (s == null) { 2595 return "NULL"; 2596 } else { 2597 return new StringBuilder(s.length() + 2).append("'").append(s) 2598 .append("'").toString(); 2599 } 2600 } 2601 2602 /** 2603 * @return a string representation of the given native array 2604 */ toString(int[][] iArray)2605 public static String toString(int[][] iArray) { 2606 if (iArray == null) { 2607 return "NULL"; 2608 } 2609 2610 StringBuilder buffer = new StringBuilder(); 2611 buffer.append("["); 2612 for (int i = 0; i < iArray.length; i++) { 2613 buffer.append("["); 2614 for (int j = 0; j < iArray[i].length; j++) { 2615 buffer.append(iArray[i][j]); 2616 if (j != (iArray[i].length - 1)) { 2617 buffer.append(", "); 2618 } 2619 } 2620 buffer.append("]"); 2621 if (i != iArray.length - 1) { 2622 buffer.append(" "); 2623 } 2624 } 2625 buffer.append("]"); 2626 return buffer.toString(); 2627 } 2628 2629 /** 2630 * @return a string representation of the given native array. 2631 */ toString(long[][] iArray)2632 public static String toString(long[][] iArray) { 2633 if (iArray == null) { return "NULL"; } 2634 2635 StringBuilder buffer = new StringBuilder(); 2636 buffer.append("["); 2637 for (int i = 0; i < iArray.length; i++) { 2638 buffer.append("["); 2639 for (int j = 0; j < iArray[i].length; j++) { 2640 buffer.append(iArray[i][j]); 2641 if (j != (iArray[i].length - 1)) { 2642 buffer.append(", "); 2643 } 2644 } 2645 buffer.append("]"); 2646 if (i != iArray.length - 1) { 2647 buffer.append(" "); 2648 } 2649 } 2650 buffer.append("]"); 2651 return buffer.toString(); 2652 } 2653 2654 /** 2655 * @return a String representation of the given object array. 2656 * The strings are obtained by calling toString() on the 2657 * underlying objects. 2658 */ toString(Object[] obj)2659 public static String toString(Object[] obj) { 2660 if (obj == null) { return "NULL"; } 2661 StringBuilder tmp = new StringBuilder(); 2662 tmp.append("["); 2663 for (int i = 0; i < obj.length; i++) { 2664 tmp.append(obj[i].toString()); 2665 if (i != obj.length - 1) { 2666 tmp.append(","); 2667 } 2668 } 2669 tmp.append("]"); 2670 return tmp.toString(); 2671 } 2672 2673 private static final char[] HEX_CHARS 2674 = { '0', '1', '2', '3', '4', '5', '6', '7', 2675 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; 2676 private static final char[] OCTAL_CHARS = HEX_CHARS; // ignore the last 8 :) 2677 2678 /** 2679 * Convert a byte array to a hex-encoding string: "a33bff00..." 2680 * 2681 * @deprecated Use {@link ByteArrays#toHexString}. 2682 */ bytesToHexString(final byte[] bytes)2683 @Deprecated public static String bytesToHexString(final byte[] bytes) { 2684 return ByteArrays.toHexString(bytes); 2685 } 2686 2687 /** 2688 * Convert a byte array to a hex-encoding string with the specified 2689 * delimiter: "a3<delimiter>3b<delimiter>ff..." 2690 */ bytesToHexString(final byte[] bytes, Character delimiter)2691 public static String bytesToHexString(final byte[] bytes, 2692 Character delimiter) { 2693 StringBuilder hex = 2694 new StringBuilder(bytes.length * (delimiter == null ? 2 : 3)); 2695 int nibble1, nibble2; 2696 for (int i = 0; i < bytes.length; i++) { 2697 nibble1 = (bytes[i] >>> 4) & 0xf; 2698 nibble2 = bytes[i] & 0xf; 2699 if (i > 0 && delimiter != null) { hex.append(delimiter.charValue()); } 2700 hex.append(HEX_CHARS[nibble1]); 2701 hex.append(HEX_CHARS[nibble2]); 2702 } 2703 return hex.toString(); 2704 } 2705 2706 /** 2707 * Safely convert the string to uppercase. 2708 * @return upper case representation of the String; or null if 2709 * the input string is null. 2710 */ toUpperCase(String src)2711 public static String toUpperCase(String src) { 2712 if (src == null) { 2713 return null; 2714 } else { 2715 return src.toUpperCase(); 2716 } 2717 } 2718 2719 /** 2720 * Safely convert the string to lowercase. 2721 * @return lower case representation of the String; or null if 2722 * the input string is null. 2723 */ toLowerCase(String src)2724 public static String toLowerCase(String src) { 2725 if (src == null) { 2726 return null; 2727 } else { 2728 return src.toLowerCase(); 2729 } 2730 } 2731 2732 private static final Pattern dbSpecPattern = 2733 Pattern.compile("(.*)\\{(\\d+),(\\d+)\\}(.*)"); 2734 2735 /** 2736 * @param dbSpecComponent a single component of a DBDescriptor spec 2737 * (e.g. the host or database component). The expected format of the string is: 2738 * <br> 2739 * <center>(prefix){(digits),(digits)}(suffix)</center> 2740 * </br> 2741 * @return a shard expansion of the given String. 2742 * Note that unless the pattern is matched exactly, no expansion is 2743 * performed and the original string is returned unaltered. 2744 * For example, 'db{0,1}.adz' is expanded into 'db0.adz, db1.adz'. 2745 * Note that this method is added to StringUtil instead of 2746 * DBDescriptor to better encapsulate the choice of regexp implementation. 2747 * @throws IllegalArgumentException if the string does not parse. 2748 */ expandShardNames(String dbSpecComponent)2749 public static String expandShardNames(String dbSpecComponent) 2750 throws IllegalArgumentException, IllegalStateException { 2751 2752 Matcher matcher = dbSpecPattern.matcher(dbSpecComponent); 2753 if (matcher.find()) { 2754 try { 2755 String prefix = dbSpecComponent.substring( 2756 matcher.start(1), matcher.end(1)); 2757 int minShard = 2758 Integer.parseInt( 2759 dbSpecComponent.substring( 2760 matcher.start(2), matcher.end(2))); 2761 int maxShard = 2762 Integer.parseInt( 2763 dbSpecComponent.substring( 2764 matcher.start(3), matcher.end(3))); 2765 String suffix = dbSpecComponent.substring( 2766 matcher.start(4), matcher.end(4)); 2767 //Log2.logEvent(prefix + " " + minShard + " " + maxShard + " " + suffix); 2768 if (minShard > maxShard) { 2769 throw new IllegalArgumentException( 2770 "Maximum shard must be greater than or equal to " + 2771 "the minimum shard"); 2772 } 2773 StringBuilder tmp = new StringBuilder(); 2774 for (int shard = minShard; shard <= maxShard; shard++) { 2775 tmp.append(prefix).append(shard).append(suffix); 2776 if (shard != maxShard) { 2777 tmp.append(","); 2778 } 2779 } 2780 return tmp.toString(); 2781 } catch (NumberFormatException nfex) { 2782 throw new IllegalArgumentException( 2783 "Malformed DB specification component: " + dbSpecComponent); 2784 } 2785 } else { 2786 return dbSpecComponent; 2787 } 2788 } 2789 2790 2791 /** 2792 * Returns a string that is equivalent to the specified string with its 2793 * first character converted to uppercase as by {@link String#toUpperCase()}. 2794 * The returned string will have the same value as the specified string if 2795 * its first character is non-alphabetic, if its first character is already 2796 * uppercase, or if the specified string is of length 0. 2797 * 2798 * <p>For example: 2799 * <pre> 2800 * capitalize("foo bar").equals("Foo bar"); 2801 * capitalize("2b or not 2b").equals("2b or not 2b") 2802 * capitalize("Foo bar").equals("Foo bar"); 2803 * capitalize("").equals(""); 2804 * </pre> 2805 * 2806 * @param s the string whose first character is to be uppercased 2807 * @return a string equivalent to <tt>s</tt> with its first character 2808 * converted to uppercase 2809 * @throws NullPointerException if <tt>s</tt> is null 2810 */ capitalize(String s)2811 public static String capitalize(String s) { 2812 if (s.length() == 0) { 2813 return s; 2814 } 2815 char first = s.charAt(0); 2816 char capitalized = Character.toUpperCase(first); 2817 return (first == capitalized) 2818 ? s 2819 : capitalized + s.substring(1); 2820 } 2821 2822 /** 2823 * Examine a string to see if it starts with a given prefix (case 2824 * insensitive). Just like String.startsWith() except doesn't 2825 * respect case. Strings are compared in the same way as in 2826 * {@link String#equalsIgnoreCase}. 2827 * 2828 * @param str the string to examine 2829 * @param prefix the prefix to look for 2830 * @return a boolean indicating if str starts with prefix (case insensitive) 2831 */ startsWithIgnoreCase(String str, String prefix)2832 public static boolean startsWithIgnoreCase(String str, String prefix) { 2833 return str.regionMatches(true, 0, prefix, 0, prefix.length()); 2834 } 2835 2836 /** 2837 * Examine a string to see if it ends with a given suffix (case 2838 * insensitive). Just like String.endsWith() except doesn't respect 2839 * case. Strings are compared in the same way as in 2840 * {@link String#equalsIgnoreCase}. 2841 * 2842 * @param str the string to examine 2843 * @param suffix the suffix to look for 2844 * @return a boolean indicating if str ends with suffix (case insensitive) 2845 */ endsWithIgnoreCase(String str, String suffix)2846 public static boolean endsWithIgnoreCase(String str, String suffix) { 2847 int len = suffix.length(); 2848 return str.regionMatches(true, str.length() - len, suffix, 0, len); 2849 } 2850 2851 /** 2852 * @param c one codePoint 2853 * @return the number of bytes needed to encode this codePoint in UTF-8 2854 */ bytesUtf8(int c)2855 private static int bytesUtf8(int c) { 2856 if (c < 0x80) { 2857 return 1; 2858 } else if (c < 0x00800) { 2859 return 2; 2860 } else if (c < 0x10000) { 2861 return 3; 2862 } else if (c < 0x200000) { 2863 return 4; 2864 2865 // RFC 3629 forbids the use of UTF-8 for codePoint greater than 0x10FFFF, 2866 // so if the caller respects this RFC, this should not happen 2867 } else if (c < 0x4000000) { 2868 return 5; 2869 } else { 2870 return 6; 2871 } 2872 } 2873 2874 /** 2875 * @param str a string 2876 * @return the number of bytes required to represent this string in UTF-8 2877 */ bytesStorage(String str)2878 public static int bytesStorage(String str) { 2879 // offsetByCodePoint has a bug if its argument is the result of a 2880 // call to substring. To avoid this, we create a new String 2881 // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664 2882 String s = new String(str); 2883 2884 int len = 0; 2885 for (int i = 0; i < s.length(); i = s.offsetByCodePoints(i, 1)) { 2886 len += bytesUtf8(s.codePointAt(i)); 2887 } 2888 return len; 2889 } 2890 2891 /** 2892 * @param str a string 2893 * @param maxbytes 2894 * @return the beginning of the string, so that it uses less than 2895 * maxbytes bytes in UTF-8 2896 * @throws IndexOutOfBoundsException if maxbytes is negative 2897 */ truncateStringForUtf8Storage(String str, int maxbytes)2898 public static String truncateStringForUtf8Storage(String str, int maxbytes) { 2899 if (maxbytes < 0) { 2900 throw new IndexOutOfBoundsException(); 2901 } 2902 2903 // offsetByCodePoint has a bug if its argument is the result of a 2904 // call to substring. To avoid this, we create a new String 2905 // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664 2906 // TODO(cquinn): should be fixed as of 1.5.0_01 2907 String s = new String(str); 2908 2909 int codepoints = 0; 2910 int bytesUsed = 0; 2911 for (codepoints = 0; codepoints < s.length(); 2912 codepoints = s.offsetByCodePoints(codepoints, 1)) { 2913 int glyphBytes = StringUtil.bytesUtf8(s.codePointAt(codepoints)); 2914 if (bytesUsed + glyphBytes > maxbytes) { 2915 break; 2916 } 2917 bytesUsed += glyphBytes; 2918 } 2919 return s.substring(0, codepoints); 2920 } 2921 2922 /** 2923 * If the given string is of length {@code maxLength} or less, then it is 2924 * returned as is. 2925 * If the string is longer than {@code maxLength}, the returned string is 2926 * truncated before the last space character on or before 2927 * {@code source.charAt(maxLength)}. If the string has no spaces, the 2928 * returned string is truncated to {@code maxLength}. 2929 * 2930 * @param source the string to truncate if necessary 2931 * @param maxLength 2932 * @return the original string if its length is less than or equal to 2933 * maxLength, otherwise a truncated string as mentioned above 2934 */ truncateIfNecessary(String source, int maxLength)2935 public static String truncateIfNecessary(String source, int maxLength) { 2936 if (source.length() <= maxLength) { 2937 return source; 2938 } 2939 String str = unicodePreservingSubstring(source, 0, maxLength); 2940 2941 @SuppressWarnings("deprecation") // we'll make this go away before that does 2942 CharMatcher whitespaceMatcher = CharMatcher.LEGACY_WHITESPACE; 2943 String truncated = whitespaceMatcher.trimTrailingFrom(str); 2944 2945 // We may have had multiple spaces at maxLength, which were stripped away 2946 if (truncated.length() < maxLength) { 2947 return truncated; 2948 } 2949 // We have a truncated string of length maxLength. If the next char was a 2950 // space, we truncated at a word boundary, so we can return immediately 2951 if (Character.isSpaceChar(source.charAt(maxLength))) { 2952 return truncated; 2953 } 2954 // We truncated in the middle of the word. Try to truncate before 2955 // the last space, if it exists. Otherwise, return the truncated string 2956 for (int i = truncated.length() - 1; i >= 0; --i) { 2957 if (Character.isSpaceChar(truncated.charAt(i))) { 2958 String substr = truncated.substring(0, i); 2959 return whitespaceMatcher.trimTrailingFrom(substr); 2960 } 2961 } 2962 return truncated; 2963 } 2964 2965 /** 2966 * If this given string is of length {@code maxLength} or less, it will 2967 * be returned as-is. 2968 * Otherwise it will be trucated to {@code maxLength}, regardless of whether 2969 * there are any space characters in the String. If an ellipsis is requested 2970 * to be appended to the truncated String, the String will be truncated so 2971 * that the ellipsis will also fit within maxLength. 2972 * If no truncation was necessary, no ellipsis will be added. 2973 * 2974 * @param source the String to truncate if necessary 2975 * @param maxLength the maximum number of characters to keep 2976 * @param addEllipsis if true, and if the String had to be truncated, 2977 * add "..." to the end of the String before returning. Additionally, 2978 * the ellipsis will only be added if maxLength is greater than 3. 2979 * @return the original string if its length is less than or equal to 2980 * maxLength, otherwise a truncated string as mentioned above 2981 */ truncateAtMaxLength(String source, int maxLength, boolean addEllipsis)2982 public static String truncateAtMaxLength(String source, int maxLength, 2983 boolean addEllipsis) { 2984 2985 if (source.length() <= maxLength) { 2986 return source; 2987 } 2988 if (addEllipsis && maxLength > 3) { 2989 return unicodePreservingSubstring(source, 0, maxLength - 3) + "..."; 2990 } 2991 return unicodePreservingSubstring(source, 0, maxLength); 2992 } 2993 2994 /** 2995 * Normalizes {@code index} such that it respects Unicode character 2996 * boundaries in {@code str}. 2997 * 2998 * <p>If {@code index} is the low surrogate of a unicode character, 2999 * the method returns {@code index - 1}. Otherwise, {@code index} is 3000 * returned. 3001 * 3002 * <p>In the case in which {@code index} falls in an invalid surrogate pair 3003 * (e.g. consecutive low surrogates, consecutive high surrogates), or if 3004 * if it is not a valid index into {@code str}, the original value of 3005 * {@code index} is returned. 3006 * 3007 * @param str the String 3008 * @param index the index to be normalized 3009 * @return a normalized index that does not split a Unicode character 3010 */ unicodePreservingIndex(String str, int index)3011 public static int unicodePreservingIndex(String str, int index) { 3012 if (index > 0 && index < str.length()) { 3013 if (Character.isHighSurrogate(str.charAt(index - 1)) && 3014 Character.isLowSurrogate(str.charAt(index))) { 3015 return index - 1; 3016 } 3017 } 3018 return index; 3019 } 3020 3021 /** 3022 * Returns a substring of {@code str} that respects Unicode character 3023 * boundaries. 3024 * 3025 * <p>The string will never be split between a [high, low] surrogate pair, 3026 * as defined by {@link Character#isHighSurrogate} and 3027 * {@link Character#isLowSurrogate}. 3028 * 3029 * <p>If {@code begin} or {@code end} are the low surrogate of a unicode 3030 * character, it will be offset by -1. 3031 * 3032 * <p>This behavior guarantees that 3033 * {@code str.equals(StringUtil.unicodePreservingSubstring(str, 0, n) + 3034 * StringUtil.unicodePreservingSubstring(str, n, str.length())) } is 3035 * true for all {@code n}. 3036 * </pre> 3037 * 3038 * <p>This means that unlike {@link String#substring(int, int)}, the length of 3039 * the returned substring may not necessarily be equivalent to 3040 * {@code end - begin}. 3041 * 3042 * @param str the original String 3043 * @param begin the beginning index, inclusive 3044 * @param end the ending index, exclusive 3045 * @return the specified substring, possibly adjusted in order to not 3046 * split unicode surrogate pairs 3047 * @throws IndexOutOfBoundsException if the {@code begin} is negative, 3048 * or {@code end} is larger than the length of {@code str}, or 3049 * {@code begin} is larger than {@code end} 3050 */ unicodePreservingSubstring( String str, int begin, int end)3051 public static String unicodePreservingSubstring( 3052 String str, int begin, int end) { 3053 return str.substring(unicodePreservingIndex(str, begin), 3054 unicodePreservingIndex(str, end)); 3055 } 3056 3057 /** 3058 * Equivalent to: 3059 * 3060 * <pre> 3061 * {@link #unicodePreservingSubstring(String, int, int)}( 3062 * str, begin, str.length()) 3063 * </pre> 3064 */ unicodePreservingSubstring(String str, int begin)3065 public static String unicodePreservingSubstring(String str, int begin) { 3066 return unicodePreservingSubstring(str, begin, str.length()); 3067 } 3068 3069 /** 3070 * True iff the given character needs to be escaped in a javascript string 3071 * literal. 3072 * <p> 3073 * We need to escape the following characters in javascript string literals. 3074 * <dl> 3075 * <dt> \ <dd> the escape character 3076 * <dt> ', " <dd> string delimiters. 3077 * TODO(msamuel): what about backticks (`) which are 3078 * non-standard but recognized as attribute delimiters. 3079 * <dt> &, <, >, = <dd> so that a string literal can be embedded in XHTML 3080 * without further escaping. 3081 * </dl> 3082 * TODO(msamuel): If we're being paranoid, should we escape + to avoid UTF-7 3083 * attacks? 3084 * <p> 3085 * Unicode format control characters (category Cf) must be escaped since they 3086 * are removed by javascript parser in a pre-lex pass. 3087 * <br>According to EcmaScript 262 Section 7.1: 3088 * <blockquote> 3089 * The format control characters can occur anywhere in the source text of 3090 * an ECMAScript program. These characters are removed from the source 3091 * text before applying the lexical grammar. 3092 * </blockquote> 3093 * <p> 3094 * Additionally, line terminators are not allowed to appear inside strings 3095 * and Section 7.3 says 3096 * <blockquote> 3097 * The following characters are considered to be line terminators:<pre> 3098 * Code Point Value Name Formal Name 3099 * \u000A Line Feed [LF] 3100 * \u000D Carriage Return [CR] 3101 * \u2028 Line separator [LS] 3102 * \u2029 Paragraph separator [PS] 3103 * </pre></blockquote> 3104 * 3105 * @param codepoint a char instead of an int since the javascript language 3106 * does not support extended unicode. 3107 */ mustEscapeCharInJsString(int codepoint)3108 static boolean mustEscapeCharInJsString(int codepoint) { 3109 return JS_ESCAPE_CHARS.contains(codepoint); 3110 } 3111 3112 /** 3113 * True iff the given character needs to be escaped in a JSON string literal. 3114 * <p> 3115 * We need to escape the following characters in JSON string literals. 3116 * <dl> 3117 * <dt> \ <dd> the escape character 3118 * <dt> " <dd> string delimiter 3119 * <dt> 0x00 - 0x1F <dd> control characters 3120 * </dl> 3121 * <p> 3122 * See EcmaScript 262 Section 15.12.1 for the full JSON grammar. 3123 */ mustEscapeCharInJsonString(int codepoint)3124 static boolean mustEscapeCharInJsonString(int codepoint) { 3125 return JSON_ESCAPE_CHARS.contains(codepoint); 3126 } 3127 3128 /** 3129 * Builds a small set of code points. 3130 * {@code com.google.common.base} cannot depend on ICU4J, thus avoiding ICU's 3131 * {@code UnicodeSet}. 3132 * For all other purposes, please use {@code com.ibm.icu.text.UnicodeSet}. 3133 */ 3134 private static class UnicodeSetBuilder { 3135 Set<Integer> codePointSet = new HashSet<Integer>(); 3136 addCodePoint(int c)3137 UnicodeSetBuilder addCodePoint(int c) { 3138 codePointSet.add(c); 3139 return this; 3140 } 3141 addRange(int from, int to)3142 UnicodeSetBuilder addRange(int from, int to) { 3143 for (int i = from; i <= to; i++) { 3144 codePointSet.add(i); 3145 } 3146 return this; 3147 } 3148 create()3149 Set<Integer> create() { 3150 return codePointSet; 3151 } 3152 } 3153 3154 private static final Set<Integer> JS_ESCAPE_CHARS = new UnicodeSetBuilder() 3155 // All characters in the class of format characters, [:Cf:]. 3156 // Source: http://unicode.org/cldr/utility/list-unicodeset.jsp. 3157 .addCodePoint(0xAD) 3158 .addRange(0x600, 0x603) 3159 .addCodePoint(0x6DD) 3160 .addCodePoint(0x070F) 3161 .addRange(0x17B4, 0x17B5) 3162 .addRange(0x200B, 0x200F) 3163 .addRange(0x202A, 0x202E) 3164 .addRange(0x2060, 0x2064) 3165 .addRange(0x206A, 0x206F) 3166 .addCodePoint(0xFEFF) 3167 .addRange(0xFFF9, 0xFFFB) 3168 .addRange(0x0001D173, 0x0001D17A) 3169 .addCodePoint(0x000E0001) 3170 .addRange(0x000E0020, 0x000E007F) 3171 // Plus characters mentioned in the docs of mustEscapeCharInJsString(). 3172 .addCodePoint(0x0000) 3173 .addCodePoint(0x000A) 3174 .addCodePoint(0x000D) 3175 .addRange(0x2028, 0x2029) 3176 .addCodePoint(0x0085) 3177 .addCodePoint(Character.codePointAt("'", 0)) 3178 .addCodePoint(Character.codePointAt("\"", 0)) 3179 .addCodePoint(Character.codePointAt("&", 0)) 3180 .addCodePoint(Character.codePointAt("<", 0)) 3181 .addCodePoint(Character.codePointAt(">", 0)) 3182 .addCodePoint(Character.codePointAt("=", 0)) 3183 .addCodePoint(Character.codePointAt("\\", 0)) 3184 .create(); 3185 3186 private static final Set<Integer> JSON_ESCAPE_CHARS = new UnicodeSetBuilder() 3187 .addCodePoint(Character.codePointAt("\"", 0)) 3188 .addCodePoint(Character.codePointAt("\\", 0)) 3189 .addRange(0x0000, 0x001F) 3190 .create(); 3191 3192 /** 3193 * <b>To be deprecated:</b> use {@link CharEscapers#xmlEscaper()} instead. 3194 */ xmlEscape(String s)3195 public static String xmlEscape(String s) { 3196 return CharEscapers.xmlEscaper().escape(s); 3197 } 3198 3199 /** 3200 * <b>To be deprecated:</b> use {@link CharEscapers#asciiHtmlEscaper()} instead. 3201 */ htmlEscape(String s)3202 public static String htmlEscape(String s) { 3203 return CharEscapers.asciiHtmlEscaper().escape(s); 3204 } 3205 }