1 /** 2 * Copyright (c) 2000, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.android.mail.common.base; 18 19 import static com.google.android.mail.common.base.Preconditions.checkArgument; 20 21 import com.google.common.base.Joiner; 22 import com.google.common.base.Joiner.MapJoiner; 23 24 import java.io.IOException; 25 import java.io.InputStream; 26 import java.io.StringWriter; 27 import java.util.ArrayList; 28 import java.util.Collection; 29 import java.util.Collections; 30 import java.util.HashMap; 31 import java.util.HashSet; 32 import java.util.Iterator; 33 import java.util.LinkedHashMap; 34 import java.util.LinkedList; 35 import java.util.List; 36 import java.util.Map; 37 import java.util.Set; 38 import java.util.StringTokenizer; 39 import java.util.regex.Matcher; 40 import java.util.regex.Pattern; 41 42 /** 43 * Static utility methods and constants pertaining to {@code String} or {@code 44 * CharSequence} instances. 45 */ 46 public final class StringUtil { StringUtil()47 private StringUtil() {} // COV_NF_LINE 48 49 /** 50 * A completely arbitrary selection of eight whitespace characters. See 51 * <a href="http://go/white+space">this spreadsheet</a> for more details 52 * about whitespace characters. 
53 * 54 * @deprecated Rewrite your code to use {@link CharMatcher#WHITESPACE}, or 55 * consider the precise set of characters you want to match and construct 56 * the right explicit {@link CharMatcher} or {@link String} for your own 57 * purposes. 58 */ 59 @Deprecated 60 public static final String WHITE_SPACES = " \r\n\t\u3000\u00A0\u2007\u202F"; 61 62 /** A string containing the carriage return and linefeed characters. */ 63 public static final String LINE_BREAKS = "\r\n"; 64 65 /** 66 * Old location of {@link Strings#isNullOrEmpty}; this method will be 67 * deprecated soon. 68 */ isEmpty(String string)69 public static boolean isEmpty(String string) { 70 return Strings.isNullOrEmpty(string); 71 } 72 73 /** 74 * Returns {@code true} if the given string is null, empty, or comprises only 75 * whitespace characters, as defined by {@link CharMatcher#WHITESPACE}. 76 * 77 * <p><b>Warning:</b> there are many competing definitions of "whitespace"; 78 * please see <a href="http://go/white+space">this spreadsheet</a> for 79 * details. 80 * 81 * @param string the string reference to check 82 * @return {@code true} if {@code string} is null, empty, or consists of 83 * whitespace characters only 84 */ isEmptyOrWhitespace(String string)85 public static boolean isEmptyOrWhitespace(String string) { 86 return string == null || CharMatcher.WHITESPACE.matchesAllOf(string); 87 } 88 89 /** 90 * Old location of {@link Strings#nullToEmpty}; this method will be 91 * deprecated soon. 92 */ makeSafe(String string)93 public static String makeSafe(String string) { 94 return Strings.nullToEmpty(string); 95 } 96 97 /** 98 * Old location of {@link Strings#emptyToNull}; this method will be 99 * deprecated soon. 100 */ toNullIfEmpty(String string)101 public static String toNullIfEmpty(String string) { 102 return Strings.emptyToNull(string); 103 } 104 105 /** 106 * Returns the given string if it is nonempty and contains at least one 107 * non-whitespace character; {@code null} otherwise. 
See comment in {@link 108 * #isEmptyOrWhitespace} on the definition of whitespace. 109 * 110 * @param string the string to test and possibly return 111 * @return {@code null} if {@code string} is null, empty, or contains only 112 * whitespace characters; {@code string} itself otherwise 113 */ toNullIfEmptyOrWhitespace( String string)114 public static String toNullIfEmptyOrWhitespace( 115 String string) { 116 return isEmptyOrWhitespace(string) ? null : string; 117 } 118 119 /** 120 * Old location of {@link Strings#repeat}; this method will be deprecated 121 * soon. 122 */ repeat(String string, int count)123 public static String repeat(String string, int count) { 124 return Strings.repeat(string, count); 125 } 126 127 /** 128 * Return the first index in the string of any of the specified characters, 129 * starting at a given index, or {@code -1} if none of the characters is 130 * present. 131 * 132 * @param string the non-null character sequence to look in 133 * @param chars a non-null character sequence containing the set of characters 134 * to look for. If empty, this method will find no matches and return 135 * {@code -1} 136 * @param fromIndex the index of the first character to examine in the input 137 * string. If negative, the entire string will be searched. If greater 138 * than or equal to the string length, no characters will be searched and 139 * {@code -1} will be returned. 140 * @return the index of the first match, or {@code -1} if no match was found. 141 * Guaranteed to be either {@code -1} or a number greater than or equal to 142 * {@code fromIndex} 143 * @throws NullPointerException if any argument is null 144 */ 145 // author: pault indexOfChars( CharSequence string, CharSequence chars, int fromIndex)146 public static int indexOfChars( 147 CharSequence string, CharSequence chars, int fromIndex) { 148 if (fromIndex >= string.length()) { 149 return -1; 150 } 151 152 /* 153 * Prepare lookup structures for the characters. 
TODO(pault): This loop 154 * could be factored into another method to allow caching of the resulting 155 * struct if a use-case of very large character sets exists. 156 */ 157 Set<Character> charSet = Collections.emptySet(); 158 boolean[] charArray = new boolean[128]; 159 for (int i = 0; i < chars.length(); i++) { 160 char c = chars.charAt(i); 161 if (c < 128) { 162 charArray[c] = true; 163 } else { 164 if (charSet.isEmpty()) { 165 charSet = new HashSet<Character>(); 166 } 167 charSet.add(c); 168 } 169 } 170 171 // Scan the string for matches 172 for (int i = Math.max(fromIndex, 0); i < string.length(); i++) { 173 char c = string.charAt(i); 174 if (c < 128) { 175 if (charArray[c]) { 176 return i; 177 } 178 } else if (charSet.contains(c)) { 179 return i; 180 } 181 } 182 return -1; 183 } 184 185 /* 186 * ------------------------------------------------------------------- 187 * This marks the end of the code that has been written or rewritten 188 * in 2008 to the quality standards of the Java core libraries group. 189 * Code below this point is still awaiting cleanup (you can help!). 190 * See http://wiki/Nonconf/JavaCoreLibrariesStandards. 191 * ------------------------------------------------------------------- 192 */ 193 194 195 /** 196 * @param str the string to split. Must not be null. 197 * @param delims the delimiter characters. Each character in the 198 * string is individually treated as a delimiter. 199 * @return an array of tokens. Will not return null. Individual tokens 200 * do not have leading/trailing whitespace removed. 
201 * @deprecated see the detailed instructions under 202 * {@link #split(String, String, boolean)} 203 */ 204 @Deprecated split(String str, String delims)205 public static String[] split(String str, String delims) { 206 return split(str, delims, false); 207 } 208 209 /** 210 * This method is deprecated because it is too inflexible, providing 211 * only a very specific set of behaviors that almost never matches exactly 212 * what you intend. Prefer using a {@link Splitter}, which is more flexible 213 * and consistent in the way it handles trimming and empty tokens. 214 * 215 * <ul> 216 * <li>Create a {@link Splitter} using {@link Splitter#on(CharMatcher)} such 217 * as {@code Splitter.on(CharMatcher.anyOf(delims))}. 218 * <li><i>If</i> you need whitespace trimmed from the ends of each segment, 219 * adding {@code .trimResults()} to your splitter definition should work 220 * in most cases. To match the exact behavior of this method, use 221 * {@code .trimResults(CharMatcher.inRange('\0', ' '))}. 222 * <li>This method silently ignores empty tokens in the input, but allows 223 * empty tokens to appear in the output if {@code trimTokens} is 224 * {@code true}. Adding {@code .omitEmptyStrings()} to your splitter 225 * definition will filter empty tokens out but will do so <i>after</i> 226 * having performed trimming. If you absolutely require this method's 227 * behavior in this respect, Splitter is not able to match it. 228 * <li>If you need the result as an array, use {@link 229 * com.google.common.collect.Iterables#toArray(Iterable, Class)} on the 230 * {@code Iterable<String>} returned by {@link Splitter#split}. 231 * </ul> 232 * 233 * @param str the string to split. Must not be null. 234 * @param delims the delimiter characters. Each character in the string 235 * is individually treated as a delimiter. 236 * @param trimTokens if true, leading/trailing whitespace is removed 237 * from the tokens 238 * @return an array of tokens. Will not return null. 
239 * @deprecated 240 */ 241 @Deprecated split( String str, String delims, boolean trimTokens)242 public static String[] split( 243 String str, String delims, boolean trimTokens) { 244 StringTokenizer tokenizer = new StringTokenizer(str, delims); 245 int n = tokenizer.countTokens(); 246 String[] list = new String[n]; 247 for (int i = 0; i < n; i++) { 248 if (trimTokens) { 249 list[i] = tokenizer.nextToken().trim(); 250 } else { 251 list[i] = tokenizer.nextToken(); 252 } 253 } 254 return list; 255 } 256 257 /** 258 * Trim characters from only the beginning of a string. 259 * This is a convenience method, it simply calls trimStart(s, null). 260 * 261 * @param s String to be trimmed 262 * @return String with whitespace characters removed from the beginning 263 */ trimStart(String s)264 public static String trimStart(String s) { 265 return trimStart(s, null); 266 } 267 268 /** 269 * Trim characters from only the beginning of a string. 270 * This method will remove all whitespace characters 271 * (defined by Character.isWhitespace(char), in addition to the characters 272 * provided, from the end of the provided string. 273 * 274 * @param s String to be trimmed 275 * @param extraChars Characters in addition to whitespace characters that 276 * should be trimmed. May be null. 277 * @return String with whitespace and characters in extraChars removed 278 * from the beginning 279 */ trimStart(String s, String extraChars)280 public static String trimStart(String s, String extraChars) { 281 int trimCount = 0; 282 while (trimCount < s.length()) { 283 char ch = s.charAt(trimCount); 284 if (Character.isWhitespace(ch) 285 || (extraChars != null && extraChars.indexOf(ch) >= 0)) { 286 trimCount++; 287 } else { 288 break; 289 } 290 } 291 292 if (trimCount == 0) { 293 return s; 294 } 295 return s.substring(trimCount); 296 } 297 298 /** 299 * Trim characters from only the end of a string. 300 * This is a convenience method, it simply calls trimEnd(s, null). 
301 * 302 * @param s String to be trimmed 303 * @return String with whitespace characters removed from the end 304 */ trimEnd(String s)305 public static String trimEnd(String s) { 306 return trimEnd(s, null); 307 } 308 309 /** 310 * Trim characters from only the end of a string. 311 * This method will remove all whitespace characters 312 * (defined by Character.isWhitespace(char), in addition to the characters 313 * provided, from the end of the provided string. 314 * 315 * @param s String to be trimmed 316 * @param extraChars Characters in addition to whitespace characters that 317 * should be trimmed. May be null. 318 * @return String with whitespace and characters in extraChars removed 319 * from the end 320 */ trimEnd(String s, String extraChars)321 public static String trimEnd(String s, String extraChars) { 322 int trimCount = 0; 323 while (trimCount < s.length()) { 324 char ch = s.charAt(s.length() - trimCount - 1); 325 if (Character.isWhitespace(ch) 326 || (extraChars != null && extraChars.indexOf(ch) >= 0)) { 327 trimCount++; 328 } else { 329 break; 330 } 331 } 332 333 if (trimCount == 0) { 334 return s; 335 } 336 return s.substring(0, s.length() - trimCount); 337 } 338 339 /** 340 * @param str the string to split. Must not be null. 341 * @param delims the delimiter characters. Each character in the 342 * string is individually treated as a delimiter. 343 * @return an array of tokens. Will not return null. Leading/trailing 344 * whitespace is removed from the tokens. 345 * @deprecated see the detailed instructions under 346 * {@link #split(String, String, boolean)} 347 */ 348 @Deprecated splitAndTrim(String str, String delims)349 public static String[] splitAndTrim(String str, String delims) { 350 return split(str, delims, true); 351 } 352 353 /** Parse comma-separated list of ints and return as array. 
*/ splitInts(String str)354 public static int[] splitInts(String str) throws IllegalArgumentException { 355 StringTokenizer tokenizer = new StringTokenizer(str, ","); 356 int n = tokenizer.countTokens(); 357 int[] list = new int[n]; 358 for (int i = 0; i < n; i++) { 359 String token = tokenizer.nextToken(); 360 list[i] = Integer.parseInt(token); 361 } 362 return list; 363 } 364 365 /** Parse comma-separated list of longs and return as array. */ splitLongs(String str)366 public static long[] splitLongs(String str) throws IllegalArgumentException { 367 StringTokenizer tokenizer = new StringTokenizer(str, ","); 368 int n = tokenizer.countTokens(); 369 long[] list = new long[n]; 370 for (int i = 0; i < n; i++) { 371 String token = tokenizer.nextToken(); 372 list[i] = Long.parseLong(token); 373 } 374 return list; 375 } 376 377 /** This replaces the occurrences of 'what' in 'str' with 'with' 378 * 379 * @param str the string to process 380 * @param what to replace 381 * @param with replace with this 382 * @return String str where 'what' was replaced with 'with' 383 * 384 * @deprecated Please use {@link String#replace(CharSequence, CharSequence)}. 385 */ 386 @Deprecated replace( String str, CharSequence what, CharSequence with)387 public static String replace( 388 String str, CharSequence what, CharSequence with) { 389 // Have to check this argument, for compatibility with the old impl. 390 // For the record, String.replace() is capable of handling an empty target 391 // string... but it does something kind of weird in that case. 392 checkArgument(what.length() > 0); 393 return str.replace(what, with); 394 } 395 396 private static final Splitter NEWLINE_SPLITTER = 397 Splitter.on('\n').omitEmptyStrings(); 398 399 /** 400 * Reformats the given string to a fixed width by inserting carriage returns 401 * and trimming unnecessary whitespace. See 402 * {@link #fixedWidth(String[], int)} for details. 
The {@code str} argument 403 * to this method will be split on newline characters ({@code '\n'}) only 404 * (regardless of platform). An array of resulting non-empty strings is 405 * then passed to {@link #fixedWidth(String[], int)} as the {@code lines} 406 * parameter. 407 * 408 * @param str the string to format 409 * @param width the fixed width (in characters) 410 */ fixedWidth(String str, int width)411 public static String fixedWidth(String str, int width) { 412 List<String> lines = new ArrayList<String>(); 413 414 for (String line : NEWLINE_SPLITTER.split(str)) { 415 lines.add(line); 416 } 417 418 String[] lineArray = lines.toArray(new String[0]); 419 return fixedWidth(lineArray, width); 420 } 421 422 /** 423 * Reformats the given array of lines to a fixed width by inserting 424 * newlines and trimming unnecessary whitespace. This uses simple 425 * whitespace-based splitting, not sophisticated internationalized 426 * line breaking. Newlines within a line are treated like any other 427 * whitespace. Lines which are already short enough will be passed 428 * through unmodified. 429 * 430 * <p>Only breaking whitespace characters (those which match 431 * {@link CharMatcher#BREAKING_WHITESPACE}) are treated as whitespace by 432 * this method. Non-breaking whitespace characters will be considered as 433 * ordinary characters which are connected to any other adjacent 434 * non-whitespace characters, and will therefore appear in the returned 435 * string in their original context. 
436 * 437 * @param lines array of lines to format 438 * @param width the fixed width (in characters) 439 */ fixedWidth(String[] lines, int width)440 public static String fixedWidth(String[] lines, int width) { 441 List<String> formattedLines = new ArrayList<String>(); 442 443 for (String line : lines) { 444 formattedLines.add(formatLineToFixedWidth(line, width)); 445 } 446 447 return Joiner.on('\n').join(formattedLines); 448 } 449 450 private static final Splitter TO_WORDS = 451 Splitter.on(CharMatcher.BREAKING_WHITESPACE).omitEmptyStrings(); 452 453 /** 454 * Helper method for {@link #fixedWidth(String[], int)} 455 */ formatLineToFixedWidth(String line, int width)456 private static String formatLineToFixedWidth(String line, int width) { 457 if (line.length() <= width) { 458 return line; 459 } 460 461 StringBuilder builder = new StringBuilder(); 462 int col = 0; 463 464 for (String word : TO_WORDS.split(line)) { 465 if (col == 0) { 466 col = word.length(); 467 } else { 468 int newCol = col + word.length() + 1; // +1 for the space 469 470 if (newCol <= width) { 471 builder.append(' '); 472 col = newCol; 473 } else { 474 builder.append('\n'); 475 col = word.length(); 476 } 477 } 478 479 builder.append(word); 480 } 481 482 return builder.toString(); 483 } 484 485 /** 486 * Splits the argument original into a list of substrings. All the 487 * substrings in the returned list (except possibly the last) will 488 * have length lineLen. 
489 * 490 * @param lineLen the length of the substrings to put in the list 491 * @param original the original string 492 * 493 * @return a list of strings of length lineLen that together make up the 494 * original string 495 * @deprecated use {@code Splitter.fixedLength(lineLen).split(original))} 496 * (note that it returns an {@code Iterable}, not a {@code List}) 497 */ 498 @Deprecated fixedSplit(String original, int lineLen)499 public static List<String> fixedSplit(String original, int lineLen) { 500 List<String> output = new ArrayList<String>(); 501 for (String elem : Splitter.fixedLength(lineLen).split(original)) { 502 output.add(elem); 503 } 504 return output; 505 } 506 507 /** 508 * Indents the given String per line. 509 * @param iString the string to indent 510 * @param iIndentDepth the depth of the indentation 511 * @return the indented string 512 */ indent(String iString, int iIndentDepth)513 public static String indent(String iString, int iIndentDepth) { 514 StringBuilder spacer = new StringBuilder(); 515 spacer.append("\n"); 516 for (int i = 0; i < iIndentDepth; i++) { 517 spacer.append(" "); 518 } 519 return iString.replace("\n", spacer.toString()); 520 } 521 522 /** 523 * This is a both way strip. 
524 * 525 * @param str the string to strip 526 * @param left strip from left 527 * @param right strip from right 528 * @param what character(s) to strip 529 * @return the stripped string 530 * @deprecated ensure the string is not null and use 531 * <ul> 532 * <li> {@code CharMatcher.anyOf(what).trimFrom(str)} 533 * if {@code left == true} and {@code right == true} 534 * <li> {@code CharMatcher.anyOf(what).trimLeadingFrom(str)} 535 * if {@code left == true} and {@code right == false} 536 * <li> {@code CharMatcher.anyOf(what).trimTrailingFrom(str)} 537 * if {@code left == false} and {@code right == true} 538 * </ul> 539 */ 540 @Deprecated megastrip(String str, boolean left, boolean right, String what)541 public static String megastrip(String str, 542 boolean left, boolean right, 543 String what) { 544 if (str == null) { 545 return null; 546 } 547 548 CharMatcher matcher = CharMatcher.anyOf(what); 549 if (left) { 550 if (right) { 551 return matcher.trimFrom(str); 552 } 553 return matcher.trimLeadingFrom(str); 554 } 555 if (right) { 556 return matcher.trimTrailingFrom(str); 557 } 558 return str; 559 } 560 561 /** strip - strips both ways 562 * 563 * @param str what to strip 564 * @return String the striped string 565 * @deprecated ensure the string is not null and use {@code 566 * CharMatcher.LEGACY_WHITESPACE.trimFrom(str)}; also consider whether you 567 * really want the legacy whitespace definition, or something more 568 * standard like {@link CharMatcher#WHITESPACE}. 569 */ 570 @SuppressWarnings("deprecation") // this is deprecated itself strip(String str)571 @Deprecated public static String strip(String str) { 572 return (str == null) ? null : CharMatcher.LEGACY_WHITESPACE.trimFrom(str); 573 } 574 575 /** Strip white spaces from both end, and collapse white spaces 576 * in the middle. 
577 * 578 * @param str what to strip 579 * @return String the striped and collapsed string 580 * @deprecated ensure the string is not null and use {@code 581 * CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ')}; also 582 * consider whether you really want the legacy whitespace definition, or 583 * something more standard like {@link CharMatcher#WHITESPACE}. 584 */ 585 @SuppressWarnings("deprecation") // this is deprecated itself stripAndCollapse(String str)586 @Deprecated public static String stripAndCollapse(String str) { 587 return (str == null) ? null 588 : CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' '); 589 } 590 591 /** 592 * Give me a string and a potential prefix, and I return the string 593 * following the prefix if the prefix matches, else null. 594 * Analogous to the c++ functions strprefix and var_strprefix. 595 * 596 * @param str the string to strip 597 * @param prefix the expected prefix 598 * @return the stripped string or <code>null</code> if the string 599 * does not start with the prefix 600 */ stripPrefix(String str, String prefix)601 public static String stripPrefix(String str, String prefix) { 602 return str.startsWith(prefix) 603 ? str.substring(prefix.length()) 604 : null; 605 } 606 607 /** 608 * Case insensitive version of stripPrefix. Strings are compared in 609 * the same way as in {@link String#equalsIgnoreCase}. 610 * Analogous to the c++ functions strcaseprefix and var_strcaseprefix. 611 * 612 * @param str the string to strip 613 * @param prefix the expected prefix 614 * @return the stripped string or <code>null</code> if the string 615 * does not start with the prefix 616 */ stripPrefixIgnoreCase(String str, String prefix)617 public static String stripPrefixIgnoreCase(String str, String prefix) { 618 return startsWithIgnoreCase(str, prefix) 619 ? 
str.substring(prefix.length()) 620 : null; 621 } 622 623 /** 624 * Give me a string and a potential suffix, and I return the string 625 * before the suffix if the suffix matches, else null. 626 * Analogous to the c++ function strsuffix. 627 * 628 * @param str the string to strip 629 * @param suffix the expected suffix 630 * @return the stripped string or <code>null</code> if the string 631 * does not end with the suffix 632 */ stripSuffix(String str, String suffix)633 public static String stripSuffix(String str, String suffix) { 634 return str.endsWith(suffix) 635 ? str.substring(0, str.length() - suffix.length()) 636 : null; 637 } 638 639 /** 640 * Case insensitive version of stripSuffix. Strings are compared in 641 * the same way as in {@link String#equalsIgnoreCase}. 642 * Analogous to the c++ function strcasesuffix. 643 * 644 * @param str the string to strip 645 * @param suffix the expected suffix 646 * @return the stripped string or <code>null</code> if the string 647 * does not end with the suffix 648 */ stripSuffixIgnoreCase( String str, String suffix)649 public static String stripSuffixIgnoreCase( 650 String str, String suffix) { 651 return endsWithIgnoreCase(str, suffix) 652 ? str.substring(0, str.length() - suffix.length()) 653 : null; 654 } 655 656 /** 657 * Strips all non-digit characters from a string. 658 * 659 * The resulting string will only contain characters for which isDigit() 660 * returns true. 
661 * 662 * @param str the string to strip 663 * @return a string consisting of digits only, or an empty string 664 * @deprecated use {@code CharMatcher.JAVA_DIGIT.retainFrom(str)} (also 665 * consider whether this is really the definition of "digit" you wish to 666 * use) 667 */ stripNonDigits(String str)668 @Deprecated public static String stripNonDigits(String str) { 669 return CharMatcher.JAVA_DIGIT.retainFrom(str); 670 } 671 672 /** 673 * Finds the last index in str of a character not in the characters 674 * in 'chars' (similar to ANSI string.find_last_not_of). 675 * 676 * Returns -1 if no such character can be found. 677 * 678 * <p><b>Note:</b> If {@code fromIndex} is zero, use {@link CharMatcher} 679 * instead for this: {@code CharMatcher.noneOf(chars).lastIndexIn(str)}. 680 */ 681 // TODO(kevinb): after adding fromIndex versions of (last)IndexOf to 682 // CharMatcher, deprecate this lastIndexNotOf(String str, String chars, int fromIndex)683 public static int lastIndexNotOf(String str, String chars, int fromIndex) { 684 fromIndex = Math.min(fromIndex, str.length() - 1); 685 686 for (int pos = fromIndex; pos >= 0; pos--) { 687 if (chars.indexOf(str.charAt(pos)) < 0) { 688 return pos; 689 } 690 } 691 692 return -1; 693 } 694 695 /** 696 * Like String.replace() except that it accepts any number of old chars. 697 * Replaces any occurrances of 'oldchars' in 'str' with 'newchar'. 698 * Example: replaceChars("Hello, world!", "H,!", ' ') returns " ello world " 699 * 700 * @deprecated use {@code CharMatcher#replaceFrom(String, char)}, for example 701 * {@code CharMatcher.anyOf(oldchars).replaceFrom(str, newchar)} 702 */ replaceChars( String str, CharSequence oldchars, char newchar)703 @Deprecated public static String replaceChars( 704 String str, CharSequence oldchars, char newchar) { 705 return CharMatcher.anyOf(oldchars).replaceFrom(str, newchar); 706 } 707 708 /** 709 * Remove any occurrances of 'oldchars' in 'str'. 
710 * Example: removeChars("Hello, world!", ",!") returns "Hello world" 711 * 712 * @deprecated use {@link CharMatcher#removeFrom(CharSequence)}, for example 713 * {@code CharMatcher.anyOf(oldchars).removeFrom(str)} 714 */ removeChars( String str, CharSequence oldchars)715 @Deprecated public static String removeChars( 716 String str, CharSequence oldchars) { 717 return CharMatcher.anyOf(oldchars).removeFrom(str); 718 } 719 720 // See http://www.microsoft.com/typography/unicode/1252.htm 721 private static final CharMatcher FANCY_SINGLE_QUOTE 722 = CharMatcher.anyOf("\u0091\u0092\u2018\u2019"); 723 private static final CharMatcher FANCY_DOUBLE_QUOTE 724 = CharMatcher.anyOf("\u0093\u0094\u201c\u201d"); 725 726 /** 727 * Replaces microsoft "smart quotes" (curly " and ') with their 728 * ascii counterparts. 729 */ replaceSmartQuotes(String str)730 public static String replaceSmartQuotes(String str) { 731 String tmp = FANCY_SINGLE_QUOTE.replaceFrom(str, '\''); 732 return FANCY_DOUBLE_QUOTE.replaceFrom(tmp, '"'); 733 } 734 735 /** 736 * Convert a string of hex digits to a byte array, with the first 737 * byte in the array being the MSB. The string passed in should be 738 * just the raw digits (upper or lower case), with no leading 739 * or trailing characters (like '0x' or 'h'). 740 * An odd number of characters is supported. 741 * If the string is empty, an empty array will be returned. 742 * 743 * This is significantly faster than using 744 * new BigInteger(str, 16).toByteArray(); 745 * especially with larger strings. 
Here are the results of some 746 * microbenchmarks done on a P4 2.8GHz 2GB RAM running 747 * linux 2.4.22-gg11 and JDK 1.5 with an optimized build: 748 * 749 * String length hexToBytes (usec) BigInteger 750 * ----------------------------------------------------- 751 * 16 0.570 1.43 752 * 256 8.21 44.4 753 * 1024 32.8 526 754 * 16384 546 121000 755 */ hexToBytes(CharSequence str)756 public static byte[] hexToBytes(CharSequence str) { 757 byte[] bytes = new byte[(str.length() + 1) / 2]; 758 if (str.length() == 0) { 759 return bytes; 760 } 761 bytes[0] = 0; 762 int nibbleIdx = (str.length() % 2); 763 for (int i = 0; i < str.length(); i++) { 764 char c = str.charAt(i); 765 if (!isHex(c)) { 766 throw new IllegalArgumentException("string contains non-hex chars"); 767 } 768 if ((nibbleIdx % 2) == 0) { 769 bytes[nibbleIdx >> 1] = (byte) (hexValue(c) << 4); 770 } else { 771 bytes[nibbleIdx >> 1] += (byte) hexValue(c); 772 } 773 nibbleIdx++; 774 } 775 return bytes; 776 } 777 778 /** 779 * Converts any instances of "\r" or "\r\n" style EOLs into "\n" (Line Feed). 780 */ convertEOLToLF(String input)781 public static String convertEOLToLF(String input) { 782 StringBuilder res = new StringBuilder(input.length()); 783 char[] s = input.toCharArray(); 784 int from = 0; 785 final int end = s.length; 786 for (int i = 0; i < end; i++) { 787 if (s[i] == '\r') { 788 res.append(s, from, i - from); 789 res.append('\n'); 790 if (i + 1 < end && s[i + 1] == '\n') { 791 i++; 792 } 793 794 from = i + 1; 795 } 796 } 797 798 if (from == 0) { // no \r! 799 return input; 800 } 801 802 res.append(s, from, end - from); 803 return res.toString(); 804 } 805 806 /** 807 * Old location of {@link Strings#padStart}; this method will be deprecated 808 * soon. 
809 */ padLeft(String s, int len, char padChar)810 public static String padLeft(String s, int len, char padChar) { 811 return Strings.padStart(s, len, padChar); 812 } 813 814 /** 815 * Old location of {@link Strings#padEnd}; this method will be deprecated 816 * soon. 817 */ padRight(String s, int len, char padChar)818 public static String padRight(String s, int len, char padChar) { 819 return Strings.padEnd(s, len, padChar); 820 } 821 822 /** 823 * Returns a string consisting of "s", with each of the first "len" characters 824 * replaced by "maskChar" character. 825 */ maskLeft(String s, int len, char maskChar)826 public static String maskLeft(String s, int len, char maskChar) { 827 if (len <= 0) { 828 return s; 829 } 830 len = Math.min(len, s.length()); 831 StringBuilder sb = new StringBuilder(); 832 for (int i = 0; i < len; i++) { 833 sb.append(maskChar); 834 } 835 sb.append(s.substring(len)); 836 return sb.toString(); 837 } 838 isOctal(char c)839 private static boolean isOctal(char c) { 840 return (c >= '0') && (c <= '7'); 841 } 842 isHex(char c)843 private static boolean isHex(char c) { 844 return ((c >= '0') && (c <= '9')) || 845 ((c >= 'a') && (c <= 'f')) || 846 ((c >= 'A') && (c <= 'F')); 847 } 848 hexValue(char c)849 private static int hexValue(char c) { 850 if ((c >= '0') && (c <= '9')) { 851 return (c - '0'); 852 } else if ((c >= 'a') && (c <= 'f')) { 853 return (c - 'a') + 10; 854 } else { 855 return (c - 'A') + 10; 856 } 857 } 858 859 /** 860 * Unescape any C escape sequences (\n, \r, \\, \ooo, etc) and return the 861 * resulting string. 
862 */ unescapeCString(String s)863 public static String unescapeCString(String s) { 864 if (s.indexOf('\\') < 0) { 865 // Fast path: nothing to unescape 866 return s; 867 } 868 869 StringBuilder sb = new StringBuilder(); 870 int len = s.length(); 871 for (int i = 0; i < len;) { 872 char c = s.charAt(i++); 873 if (c == '\\' && (i < len)) { 874 c = s.charAt(i++); 875 switch (c) { 876 case 'a': c = '\007'; break; 877 case 'b': c = '\b'; break; 878 case 'f': c = '\f'; break; 879 case 'n': c = '\n'; break; 880 case 'r': c = '\r'; break; 881 case 't': c = '\t'; break; 882 case 'v': c = '\013'; break; 883 case '\\': c = '\\'; break; 884 case '?': c = '?'; break; 885 case '\'': c = '\''; break; 886 case '"': c = '\"'; break; 887 888 default: { 889 if ((c == 'x') && (i < len) && isHex(s.charAt(i))) { 890 // "\xXX" 891 int v = hexValue(s.charAt(i++)); 892 if ((i < len) && isHex(s.charAt(i))) { 893 v = v * 16 + hexValue(s.charAt(i++)); 894 } 895 c = (char) v; 896 } else if (isOctal(c)) { 897 // "\OOO" 898 int v = (c - '0'); 899 if ((i < len) && isOctal(s.charAt(i))) { 900 v = v * 8 + (s.charAt(i++) - '0'); 901 } 902 if ((i < len) && isOctal(s.charAt(i))) { 903 v = v * 8 + (s.charAt(i++) - '0'); 904 } 905 c = (char) v; 906 } else { 907 // Propagate unknown escape sequences. 908 sb.append('\\'); 909 } 910 break; 911 } 912 } 913 } 914 sb.append(c); 915 } 916 return sb.toString(); 917 } 918 919 /** 920 * Unescape any MySQL escape sequences. 921 * See MySQL language reference Chapter 6 at 922 * <a href="http://www.mysql.com/doc/">http://www.mysql.com/doc/</a>. 923 * This function will <strong>not</strong> work for other SQL-like 924 * dialects. 925 * @param s string to unescape, with the surrounding quotes. 926 * @return unescaped string, without the surrounding quotes. 927 * @exception IllegalArgumentException if s is not a valid MySQL string. 
928 */ unescapeMySQLString(String s)929 public static String unescapeMySQLString(String s) 930 throws IllegalArgumentException { 931 // note: the same buffer is used for both reading and writing 932 // it works because the writer can never outrun the reader 933 char chars[] = s.toCharArray(); 934 935 // the string must be quoted 'like this' or "like this" 936 if (chars.length < 2 || chars[0] != chars[chars.length - 1] || 937 (chars[0] != '\'' && chars[0] != '"')) { 938 throw new IllegalArgumentException("not a valid MySQL string: " + s); 939 } 940 941 // parse the string and decode the backslash sequences; in addition, 942 // quotes can be escaped 'like this: ''', "like this: """, or 'like this: "' 943 int j = 1; // write position in the string (never exceeds read position) 944 int f = 0; // state: 0 (normal), 1 (backslash), 2 (quote) 945 for (int i = 1; i < chars.length - 1; i++) { 946 if (f == 0) { // previous character was normal 947 if (chars[i] == '\\') { 948 f = 1; // backslash 949 } else if (chars[i] == chars[0]) { 950 f = 2; // quoting character 951 } else { 952 chars[j++] = chars[i]; 953 } 954 } else if (f == 1) { // previous character was a backslash 955 switch (chars[i]) { 956 case '0': chars[j++] = '\0'; break; 957 case '\'': chars[j++] = '\''; break; 958 case '"': chars[j++] = '"'; break; 959 case 'b': chars[j++] = '\b'; break; 960 case 'n': chars[j++] = '\n'; break; 961 case 'r': chars[j++] = '\r'; break; 962 case 't': chars[j++] = '\t'; break; 963 case 'z': chars[j++] = '\032'; break; 964 case '\\': chars[j++] = '\\'; break; 965 default: 966 // if the character is not special, backslash disappears 967 chars[j++] = chars[i]; 968 break; 969 } 970 f = 0; 971 } else { // previous character was a quote 972 // quoting characters must be doubled inside a string 973 if (chars[i] != chars[0]) { 974 throw new IllegalArgumentException("not a valid MySQL string: " + s); 975 } 976 chars[j++] = chars[0]; 977 f = 0; 978 } 979 } 980 // string contents cannot end 
with a special character 981 if (f != 0) { 982 throw new IllegalArgumentException("not a valid MySQL string: " + s); 983 } 984 985 // done 986 return new String(chars, 1, j - 1); 987 } 988 989 // TODO(pbarry): move all HTML methods to common.html package 990 991 static final Map<String, Character> ESCAPE_STRINGS; 992 static final Set<Character> HEX_LETTERS; 993 994 static { 995 // HTML character entity references as defined in HTML 4 996 // see http://www.w3.org/TR/REC-html40/sgml/entities.html 997 ESCAPE_STRINGS = new HashMap<String, Character>(252); 998 999 ESCAPE_STRINGS.put(" ", '\u00A0'); 1000 ESCAPE_STRINGS.put("¡", '\u00A1'); 1001 ESCAPE_STRINGS.put("¢", '\u00A2'); 1002 ESCAPE_STRINGS.put("£", '\u00A3'); 1003 ESCAPE_STRINGS.put("¤", '\u00A4'); 1004 ESCAPE_STRINGS.put("¥", '\u00A5'); 1005 ESCAPE_STRINGS.put("¦", '\u00A6'); 1006 ESCAPE_STRINGS.put("§", '\u00A7'); 1007 ESCAPE_STRINGS.put("¨", '\u00A8'); 1008 ESCAPE_STRINGS.put("©", '\u00A9'); 1009 ESCAPE_STRINGS.put("ª", '\u00AA'); 1010 ESCAPE_STRINGS.put("«", '\u00AB'); 1011 ESCAPE_STRINGS.put("¬", '\u00AC'); 1012 ESCAPE_STRINGS.put("­", '\u00AD'); 1013 ESCAPE_STRINGS.put("®", '\u00AE'); 1014 ESCAPE_STRINGS.put("¯", '\u00AF'); 1015 ESCAPE_STRINGS.put("°", '\u00B0'); 1016 ESCAPE_STRINGS.put("±", '\u00B1'); 1017 ESCAPE_STRINGS.put("²", '\u00B2'); 1018 ESCAPE_STRINGS.put("³", '\u00B3'); 1019 ESCAPE_STRINGS.put("´", '\u00B4'); 1020 ESCAPE_STRINGS.put("µ", '\u00B5'); 1021 ESCAPE_STRINGS.put("¶", '\u00B6'); 1022 ESCAPE_STRINGS.put("·", '\u00B7'); 1023 ESCAPE_STRINGS.put("¸", '\u00B8'); 1024 ESCAPE_STRINGS.put("¹", '\u00B9'); 1025 ESCAPE_STRINGS.put("º", '\u00BA'); 1026 ESCAPE_STRINGS.put("»", '\u00BB'); 1027 ESCAPE_STRINGS.put("¼", '\u00BC'); 1028 ESCAPE_STRINGS.put("½", '\u00BD'); 1029 ESCAPE_STRINGS.put("¾", '\u00BE'); 1030 ESCAPE_STRINGS.put("¿", '\u00BF'); 1031 ESCAPE_STRINGS.put("À", '\u00C0'); 1032 ESCAPE_STRINGS.put("Á", '\u00C1'); 1033 ESCAPE_STRINGS.put("Â", '\u00C2'); 1034 ESCAPE_STRINGS.put("Ã", 
'\u00C3'); 1035 ESCAPE_STRINGS.put("Ä", '\u00C4'); 1036 ESCAPE_STRINGS.put("Å", '\u00C5'); 1037 ESCAPE_STRINGS.put("Æ", '\u00C6'); 1038 ESCAPE_STRINGS.put("Ç", '\u00C7'); 1039 ESCAPE_STRINGS.put("È", '\u00C8'); 1040 ESCAPE_STRINGS.put("É", '\u00C9'); 1041 ESCAPE_STRINGS.put("Ê", '\u00CA'); 1042 ESCAPE_STRINGS.put("Ë", '\u00CB'); 1043 ESCAPE_STRINGS.put("Ì", '\u00CC'); 1044 ESCAPE_STRINGS.put("Í", '\u00CD'); 1045 ESCAPE_STRINGS.put("Î", '\u00CE'); 1046 ESCAPE_STRINGS.put("Ï", '\u00CF'); 1047 ESCAPE_STRINGS.put("Ð", '\u00D0'); 1048 ESCAPE_STRINGS.put("Ñ", '\u00D1'); 1049 ESCAPE_STRINGS.put("Ò", '\u00D2'); 1050 ESCAPE_STRINGS.put("Ó", '\u00D3'); 1051 ESCAPE_STRINGS.put("Ô", '\u00D4'); 1052 ESCAPE_STRINGS.put("Õ", '\u00D5'); 1053 ESCAPE_STRINGS.put("Ö", '\u00D6'); 1054 ESCAPE_STRINGS.put("×", '\u00D7'); 1055 ESCAPE_STRINGS.put("Ø", '\u00D8'); 1056 ESCAPE_STRINGS.put("Ù", '\u00D9'); 1057 ESCAPE_STRINGS.put("Ú", '\u00DA'); 1058 ESCAPE_STRINGS.put("Û", '\u00DB'); 1059 ESCAPE_STRINGS.put("Ü", '\u00DC'); 1060 ESCAPE_STRINGS.put("Ý", '\u00DD'); 1061 ESCAPE_STRINGS.put("Þ", '\u00DE'); 1062 ESCAPE_STRINGS.put("ß", '\u00DF'); 1063 ESCAPE_STRINGS.put("à", '\u00E0'); 1064 ESCAPE_STRINGS.put("á", '\u00E1'); 1065 ESCAPE_STRINGS.put("â", '\u00E2'); 1066 ESCAPE_STRINGS.put("ã", '\u00E3'); 1067 ESCAPE_STRINGS.put("ä", '\u00E4'); 1068 ESCAPE_STRINGS.put("å", '\u00E5'); 1069 ESCAPE_STRINGS.put("æ", '\u00E6'); 1070 ESCAPE_STRINGS.put("ç", '\u00E7'); 1071 ESCAPE_STRINGS.put("è", '\u00E8'); 1072 ESCAPE_STRINGS.put("é", '\u00E9'); 1073 ESCAPE_STRINGS.put("ê", '\u00EA'); 1074 ESCAPE_STRINGS.put("ë", '\u00EB'); 1075 ESCAPE_STRINGS.put("ì", '\u00EC'); 1076 ESCAPE_STRINGS.put("í", '\u00ED'); 1077 ESCAPE_STRINGS.put("î", '\u00EE'); 1078 ESCAPE_STRINGS.put("ï", '\u00EF'); 1079 ESCAPE_STRINGS.put("ð", '\u00F0'); 1080 ESCAPE_STRINGS.put("ñ", '\u00F1'); 1081 ESCAPE_STRINGS.put("ò", '\u00F2'); 1082 ESCAPE_STRINGS.put("ó", '\u00F3'); 1083 ESCAPE_STRINGS.put("ô", '\u00F4'); 1084 ESCAPE_STRINGS.put("õ", 
'\u00F5'); 1085 ESCAPE_STRINGS.put("ö", '\u00F6'); 1086 ESCAPE_STRINGS.put("÷", '\u00F7'); 1087 ESCAPE_STRINGS.put("ø", '\u00F8'); 1088 ESCAPE_STRINGS.put("ù", '\u00F9'); 1089 ESCAPE_STRINGS.put("ú", '\u00FA'); 1090 ESCAPE_STRINGS.put("û", '\u00FB'); 1091 ESCAPE_STRINGS.put("ü", '\u00FC'); 1092 ESCAPE_STRINGS.put("ý", '\u00FD'); 1093 ESCAPE_STRINGS.put("þ", '\u00FE'); 1094 ESCAPE_STRINGS.put("ÿ", '\u00FF'); 1095 ESCAPE_STRINGS.put("&fnof", '\u0192'); 1096 ESCAPE_STRINGS.put("&Alpha", '\u0391'); 1097 ESCAPE_STRINGS.put("&Beta", '\u0392'); 1098 ESCAPE_STRINGS.put("&Gamma", '\u0393'); 1099 ESCAPE_STRINGS.put("&Delta", '\u0394'); 1100 ESCAPE_STRINGS.put("&Epsilon", '\u0395'); 1101 ESCAPE_STRINGS.put("&Zeta", '\u0396'); 1102 ESCAPE_STRINGS.put("&Eta", '\u0397'); 1103 ESCAPE_STRINGS.put("&Theta", '\u0398'); 1104 ESCAPE_STRINGS.put("&Iota", '\u0399'); 1105 ESCAPE_STRINGS.put("&Kappa", '\u039A'); 1106 ESCAPE_STRINGS.put("&Lambda", '\u039B'); 1107 ESCAPE_STRINGS.put("&Mu", '\u039C'); 1108 ESCAPE_STRINGS.put("&Nu", '\u039D'); 1109 ESCAPE_STRINGS.put("&Xi", '\u039E'); 1110 ESCAPE_STRINGS.put("&Omicron", '\u039F'); 1111 ESCAPE_STRINGS.put("&Pi", '\u03A0'); 1112 ESCAPE_STRINGS.put("&Rho", '\u03A1'); 1113 ESCAPE_STRINGS.put("&Sigma", '\u03A3'); 1114 ESCAPE_STRINGS.put("&Tau", '\u03A4'); 1115 ESCAPE_STRINGS.put("&Upsilon", '\u03A5'); 1116 ESCAPE_STRINGS.put("&Phi", '\u03A6'); 1117 ESCAPE_STRINGS.put("&Chi", '\u03A7'); 1118 ESCAPE_STRINGS.put("&Psi", '\u03A8'); 1119 ESCAPE_STRINGS.put("&Omega", '\u03A9'); 1120 ESCAPE_STRINGS.put("&alpha", '\u03B1'); 1121 ESCAPE_STRINGS.put("&beta", '\u03B2'); 1122 ESCAPE_STRINGS.put("&gamma", '\u03B3'); 1123 ESCAPE_STRINGS.put("&delta", '\u03B4'); 1124 ESCAPE_STRINGS.put("&epsilon", '\u03B5'); 1125 ESCAPE_STRINGS.put("&zeta", '\u03B6'); 1126 ESCAPE_STRINGS.put("&eta", '\u03B7'); 1127 ESCAPE_STRINGS.put("&theta", '\u03B8'); 1128 ESCAPE_STRINGS.put("&iota", '\u03B9'); 1129 ESCAPE_STRINGS.put("&kappa", '\u03BA'); 1130 ESCAPE_STRINGS.put("&lambda", 
'\u03BB'); 1131 ESCAPE_STRINGS.put("&mu", '\u03BC'); 1132 ESCAPE_STRINGS.put("&nu", '\u03BD'); 1133 ESCAPE_STRINGS.put("&xi", '\u03BE'); 1134 ESCAPE_STRINGS.put("&omicron", '\u03BF'); 1135 ESCAPE_STRINGS.put("&pi", '\u03C0'); 1136 ESCAPE_STRINGS.put("&rho", '\u03C1'); 1137 ESCAPE_STRINGS.put("&sigmaf", '\u03C2'); 1138 ESCAPE_STRINGS.put("&sigma", '\u03C3'); 1139 ESCAPE_STRINGS.put("&tau", '\u03C4'); 1140 ESCAPE_STRINGS.put("&upsilon", '\u03C5'); 1141 ESCAPE_STRINGS.put("&phi", '\u03C6'); 1142 ESCAPE_STRINGS.put("&chi", '\u03C7'); 1143 ESCAPE_STRINGS.put("&psi", '\u03C8'); 1144 ESCAPE_STRINGS.put("&omega", '\u03C9'); 1145 ESCAPE_STRINGS.put("&thetasym", '\u03D1'); 1146 ESCAPE_STRINGS.put("&upsih", '\u03D2'); 1147 ESCAPE_STRINGS.put("&piv", '\u03D6'); 1148 ESCAPE_STRINGS.put("&bull", '\u2022'); 1149 ESCAPE_STRINGS.put("&hellip", '\u2026'); 1150 ESCAPE_STRINGS.put("&prime", '\u2032'); 1151 ESCAPE_STRINGS.put("&Prime", '\u2033'); 1152 ESCAPE_STRINGS.put("&oline", '\u203E'); 1153 ESCAPE_STRINGS.put("&frasl", '\u2044'); 1154 ESCAPE_STRINGS.put("&weierp", '\u2118'); 1155 ESCAPE_STRINGS.put("&image", '\u2111'); 1156 ESCAPE_STRINGS.put("&real", '\u211C'); 1157 ESCAPE_STRINGS.put("&trade", '\u2122'); 1158 ESCAPE_STRINGS.put("&alefsym", '\u2135'); 1159 ESCAPE_STRINGS.put("&larr", '\u2190'); 1160 ESCAPE_STRINGS.put("&uarr", '\u2191'); 1161 ESCAPE_STRINGS.put("&rarr", '\u2192'); 1162 ESCAPE_STRINGS.put("&darr", '\u2193'); 1163 ESCAPE_STRINGS.put("&harr", '\u2194'); 1164 ESCAPE_STRINGS.put("&crarr", '\u21B5'); 1165 ESCAPE_STRINGS.put("&lArr", '\u21D0'); 1166 ESCAPE_STRINGS.put("&uArr", '\u21D1'); 1167 ESCAPE_STRINGS.put("&rArr", '\u21D2'); 1168 ESCAPE_STRINGS.put("&dArr", '\u21D3'); 1169 ESCAPE_STRINGS.put("&hArr", '\u21D4'); 1170 ESCAPE_STRINGS.put("&forall", '\u2200'); 1171 ESCAPE_STRINGS.put("&part", '\u2202'); 1172 ESCAPE_STRINGS.put("&exist", '\u2203'); 1173 ESCAPE_STRINGS.put("&empty", '\u2205'); 1174 ESCAPE_STRINGS.put("&nabla", '\u2207'); 1175 ESCAPE_STRINGS.put("&isin", 
'\u2208'); 1176 ESCAPE_STRINGS.put("¬in", '\u2209'); 1177 ESCAPE_STRINGS.put("&ni", '\u220B'); 1178 ESCAPE_STRINGS.put("&prod", '\u220F'); 1179 ESCAPE_STRINGS.put("&sum", '\u2211'); 1180 ESCAPE_STRINGS.put("&minus", '\u2212'); 1181 ESCAPE_STRINGS.put("&lowast", '\u2217'); 1182 ESCAPE_STRINGS.put("&radic", '\u221A'); 1183 ESCAPE_STRINGS.put("&prop", '\u221D'); 1184 ESCAPE_STRINGS.put("&infin", '\u221E'); 1185 ESCAPE_STRINGS.put("&ang", '\u2220'); 1186 ESCAPE_STRINGS.put("&and", '\u2227'); 1187 ESCAPE_STRINGS.put("&or", '\u2228'); 1188 ESCAPE_STRINGS.put("&cap", '\u2229'); 1189 ESCAPE_STRINGS.put("&cup", '\u222A'); 1190 ESCAPE_STRINGS.put("&int", '\u222B'); 1191 ESCAPE_STRINGS.put("&there4", '\u2234'); 1192 ESCAPE_STRINGS.put("&sim", '\u223C'); 1193 ESCAPE_STRINGS.put("&cong", '\u2245'); 1194 ESCAPE_STRINGS.put("&asymp", '\u2248'); 1195 ESCAPE_STRINGS.put("&ne", '\u2260'); 1196 ESCAPE_STRINGS.put("&equiv", '\u2261'); 1197 ESCAPE_STRINGS.put("&le", '\u2264'); 1198 ESCAPE_STRINGS.put("&ge", '\u2265'); 1199 ESCAPE_STRINGS.put("&sub", '\u2282'); 1200 ESCAPE_STRINGS.put("&sup", '\u2283'); 1201 ESCAPE_STRINGS.put("&nsub", '\u2284'); 1202 ESCAPE_STRINGS.put("&sube", '\u2286'); 1203 ESCAPE_STRINGS.put("&supe", '\u2287'); 1204 ESCAPE_STRINGS.put("&oplus", '\u2295'); 1205 ESCAPE_STRINGS.put("&otimes", '\u2297'); 1206 ESCAPE_STRINGS.put("&perp", '\u22A5'); 1207 ESCAPE_STRINGS.put("&sdot", '\u22C5'); 1208 ESCAPE_STRINGS.put("&lceil", '\u2308'); 1209 ESCAPE_STRINGS.put("&rceil", '\u2309'); 1210 ESCAPE_STRINGS.put("&lfloor", '\u230A'); 1211 ESCAPE_STRINGS.put("&rfloor", '\u230B'); 1212 ESCAPE_STRINGS.put("&lang", '\u2329'); 1213 ESCAPE_STRINGS.put("&rang", '\u232A'); 1214 ESCAPE_STRINGS.put("&loz", '\u25CA'); 1215 ESCAPE_STRINGS.put("&spades", '\u2660'); 1216 ESCAPE_STRINGS.put("&clubs", '\u2663'); 1217 ESCAPE_STRINGS.put("&hearts", '\u2665'); 1218 ESCAPE_STRINGS.put("&diams", '\u2666'); 1219 ESCAPE_STRINGS.put(""", '\u0022'); 1220 ESCAPE_STRINGS.put("&", '\u0026'); 1221 
ESCAPE_STRINGS.put("<", '\u003C'); 1222 ESCAPE_STRINGS.put(">", '\u003E'); 1223 ESCAPE_STRINGS.put("&OElig", '\u0152'); 1224 ESCAPE_STRINGS.put("&oelig", '\u0153'); 1225 ESCAPE_STRINGS.put("&Scaron", '\u0160'); 1226 ESCAPE_STRINGS.put("&scaron", '\u0161'); 1227 ESCAPE_STRINGS.put("&Yuml", '\u0178'); 1228 ESCAPE_STRINGS.put("&circ", '\u02C6'); 1229 ESCAPE_STRINGS.put("&tilde", '\u02DC'); 1230 ESCAPE_STRINGS.put("&ensp", '\u2002'); 1231 ESCAPE_STRINGS.put("&emsp", '\u2003'); 1232 ESCAPE_STRINGS.put("&thinsp", '\u2009'); 1233 ESCAPE_STRINGS.put("&zwnj", '\u200C'); 1234 ESCAPE_STRINGS.put("&zwj", '\u200D'); 1235 ESCAPE_STRINGS.put("&lrm", '\u200E'); 1236 ESCAPE_STRINGS.put("&rlm", '\u200F'); 1237 ESCAPE_STRINGS.put("&ndash", '\u2013'); 1238 ESCAPE_STRINGS.put("&mdash", '\u2014'); 1239 ESCAPE_STRINGS.put("&lsquo", '\u2018'); 1240 ESCAPE_STRINGS.put("&rsquo", '\u2019'); 1241 ESCAPE_STRINGS.put("&sbquo", '\u201A'); 1242 ESCAPE_STRINGS.put("&ldquo", '\u201C'); 1243 ESCAPE_STRINGS.put("&rdquo", '\u201D'); 1244 ESCAPE_STRINGS.put("&bdquo", '\u201E'); 1245 ESCAPE_STRINGS.put("&dagger", '\u2020'); 1246 ESCAPE_STRINGS.put("&Dagger", '\u2021'); 1247 ESCAPE_STRINGS.put("&permil", '\u2030'); 1248 ESCAPE_STRINGS.put("&lsaquo", '\u2039'); 1249 ESCAPE_STRINGS.put("&rsaquo", '\u203A'); 1250 ESCAPE_STRINGS.put("&euro", '\u20AC'); 1251 1252 HEX_LETTERS = new HashSet<Character>(12); 1253 1254 HEX_LETTERS.add('a'); 1255 HEX_LETTERS.add('A'); 1256 HEX_LETTERS.add('b'); 1257 HEX_LETTERS.add('B'); 1258 HEX_LETTERS.add('c'); 1259 HEX_LETTERS.add('C'); 1260 HEX_LETTERS.add('d'); 1261 HEX_LETTERS.add('D'); 1262 HEX_LETTERS.add('e'); 1263 HEX_LETTERS.add('E'); 1264 HEX_LETTERS.add('f'); 1265 HEX_LETTERS.add('F'); 1266 } 1267 1268 /** 1269 * <p> 1270 * Replace all the occurences of HTML escape strings with the 1271 * respective characters. 1272 * </p> 1273 * <p> 1274 * The default mode is strict (requiring semicolons). 
1275 * </p> 1276 * 1277 * @param s a <code>String</code> value 1278 * @return a <code>String</code> value 1279 * @throws NullPointerException if the input string is null. 1280 */ unescapeHTML(String s)1281 public static final String unescapeHTML(String s) { 1282 return unescapeHTML(s, false); 1283 } 1284 1285 /** 1286 * Replace all the occurences of HTML escape strings with the 1287 * respective characters. 1288 * 1289 * @param s a <code>String</code> value 1290 * @param emulateBrowsers a <code>Boolean</code> value that tells the method 1291 * to allow entity refs not terminated with a semicolon to be unescaped. 1292 * (a quirk of this feature, and some browsers, is that an explicit 1293 * terminating character is needed - e.g., <$ would be unescaped, but 1294 * not <ab - see the tests for a more in-depth description of browsers) 1295 * @return a <code>String</code> value 1296 * @throws NullPointerException if the input string is null. 1297 */ unescapeHTML(String s, boolean emulateBrowsers)1298 public static final String unescapeHTML(String s, boolean emulateBrowsers) { 1299 1300 // See if there are any '&' in the string since that is what we look 1301 // for to escape. If there isn't, then we don't need to escape this string 1302 // Based on similar technique used in the escape function. 1303 int index = s.indexOf('&'); 1304 if (index == -1) { 1305 // Nothing to escape. Return the original string. 1306 return s; 1307 } 1308 1309 // We found an escaped character. Start slow escaping from there. 1310 char[] chars = s.toCharArray(); 1311 char[] escaped = new char[chars.length]; 1312 System.arraycopy(chars, 0, escaped, 0, index); 1313 1314 // Note: escaped[pos] = end of the escaped char array. 1315 int pos = index; 1316 1317 for (int i = index; i < chars.length;) { 1318 if (chars[i] != '&') { 1319 escaped[pos++] = chars[i++]; 1320 continue; 1321 } 1322 1323 // Allow e.g. 
{ 1324 int j = i + 1; 1325 boolean isNumericEntity = false; 1326 if (j < chars.length && chars[j] == '#') { 1327 j++; 1328 isNumericEntity = true; 1329 } 1330 1331 // if it's numeric, also check for hex 1332 boolean isHexEntity = false; 1333 if (j < chars.length && (chars[j] == 'x' || chars[j] == 'X')) { 1334 j++; 1335 isHexEntity = true; 1336 } 1337 1338 // Scan until we find a char that is not valid for this sequence. 1339 for (; j < chars.length; j++) { 1340 char ch = chars[j]; 1341 boolean isDigit = Character.isDigit(ch); 1342 if (isNumericEntity) { 1343 // non-hex numeric sequence end condition 1344 if (!isHexEntity && !isDigit) { 1345 break; 1346 } 1347 // hex sequence end contition 1348 if (isHexEntity && !isDigit && !HEX_LETTERS.contains(ch)) { 1349 break; 1350 } 1351 } 1352 // anything other than a digit or letter is always an end condition 1353 if (!isDigit && !Character.isLetter(ch)) { 1354 break; 1355 } 1356 } 1357 1358 boolean replaced = false; 1359 if ((j <= chars.length && emulateBrowsers) || 1360 (j < chars.length && chars[j] == ';')) { 1361 // Check for &#D; and 
 pattern 1362 if (i + 2 < chars.length && s.charAt(i + 1) == '#') { 1363 try { 1364 long charcode = 0; 1365 char ch = s.charAt(i + 2); 1366 if (isHexEntity) { 1367 charcode = Long.parseLong( 1368 new String(chars, i + 3, j - i - 3), 16); 1369 } else if (Character.isDigit(ch)) { 1370 charcode = Long.parseLong( 1371 new String(chars, i + 2, j - i - 2)); 1372 } 1373 // D800 to DFFF are for UTF16 surrogate pairs, and are not valid HTML entities 1374 // Code points 0xFFFE and 0xFFFF are unicode noncharacters 1375 if ((charcode > 0 && charcode < 0xD800) || (charcode > 0xDFFF && charcode < 0xFFFE)) { 1376 escaped[pos++] = (char) charcode; 1377 replaced = true; 1378 } else if (charcode >= 0x10000 && charcode < 0x110000) { 1379 // These characters are represented as surrogate pairs in UTF16 1380 escaped[pos++] = (char) ((charcode - 0x10000) / 0x400 + 0xD800); 1381 escaped[pos++] = (char) ((charcode - 0x10000) % 0x400 + 0xDC00); 1382 replaced = true; 1383 } 1384 } catch (NumberFormatException ex) { 1385 // Failed, not replaced. 1386 } 1387 } else { 1388 String key = new String(chars, i, j - i); 1389 Character repl = ESCAPE_STRINGS.get(key); 1390 if (repl != null) { 1391 escaped[pos++] = repl; 1392 replaced = true; 1393 } 1394 } 1395 // Skip over ';' 1396 if (j < chars.length && chars[j] == ';') { 1397 j++; 1398 } 1399 } 1400 1401 if (!replaced) { 1402 // Not a recognized escape sequence, leave as-is 1403 System.arraycopy(chars, i, escaped, pos, j - i); 1404 pos += j - i; 1405 } 1406 i = j; 1407 } 1408 return new String(escaped, 0, pos); 1409 } 1410 1411 // Escaper for < and > only. 1412 private static final CharEscaper LT_GT_ESCAPE = 1413 new CharEscaperBuilder() 1414 .addEscape('<', "<") 1415 .addEscape('>', ">") 1416 .toEscaper(); 1417 1418 private static final Pattern htmlTagPattern = 1419 Pattern.compile("</?[a-zA-Z][^>]*>"); 1420 1421 /** 1422 * Given a <code>String</code>, returns an equivalent <code>String</code> with 1423 * all HTML tags stripped. 
Note that HTML entities, such as "&amp;" will 1424 * still be preserved. 1425 */ stripHtmlTags(String string)1426 public static String stripHtmlTags(String string) { 1427 if ((string == null) || "".equals(string)) { 1428 return string; 1429 } 1430 String stripped = htmlTagPattern.matcher(string).replaceAll(""); 1431 /* 1432 * Certain inputs result in a well-formed HTML: 1433 * <<X>script>alert(0)<</X>/script> results in <script>alert(0)</script> 1434 * The following step ensures that no HTML can slip through by replacing all 1435 * < and > characters with < and > after HTML tags were stripped. 1436 */ 1437 return LT_GT_ESCAPE.escape(stripped); 1438 } 1439 1440 /** 1441 * We escape some characters in s to be able to insert strings into JavaScript 1442 * code. Also, make sure that we don't write out {@code -->} or 1443 * {@code </script>}, which may close a script tag, or any char in ["'>] which 1444 * might close a tag or attribute if seen inside an attribute. 1445 */ javaScriptEscape(CharSequence s)1446 public static String javaScriptEscape(CharSequence s) { 1447 return javaScriptEscapeHelper(s, false); 1448 } 1449 1450 /** 1451 * We escape some characters in s to be able to insert strings into JavaScript 1452 * code. Also, make sure that we don't write out {@code -->} or 1453 * {@code </script>}, which may close a script tag, or any char in ["'>] which 1454 * might close a tag or attribute if seen inside an attribute. 1455 * Turns all non-ascii characters into ASCII javascript escape sequences 1456 * (eg \\uhhhh or \ooo). 1457 */ javaScriptEscapeToAscii(CharSequence s)1458 public static String javaScriptEscapeToAscii(CharSequence s) { 1459 return javaScriptEscapeHelper(s, true); 1460 } 1461 1462 /** 1463 * Represents the type of javascript escaping to perform. Each enum below 1464 * determines whether to use octal escapes and how to handle quotes. 1465 */ 1466 public static enum JsEscapingMode { 1467 /** No octal escapes, pass-through ', and escape " as \". 
*/ 1468 JSON, 1469 1470 /** Octal escapes, escapes ' and " to \42 and \47, respectively. */ 1471 EMBEDDABLE_JS, 1472 1473 /** Octal escapes, escapes ' and " to \' and \". */ 1474 MINIMAL_JS 1475 } 1476 1477 /** 1478 * Helper for javaScriptEscape and javaScriptEscapeToAscii 1479 */ javaScriptEscapeHelper(CharSequence s, boolean escapeToAscii)1480 private static String javaScriptEscapeHelper(CharSequence s, 1481 boolean escapeToAscii) { 1482 StringBuilder sb = new StringBuilder(s.length() * 9 / 8); 1483 try { 1484 escapeStringBody(s, escapeToAscii, JsEscapingMode.EMBEDDABLE_JS, sb); 1485 } catch (IOException ex) { 1486 // StringBuilder.append does not throw IOExceptions. 1487 throw new RuntimeException(ex); 1488 } 1489 return sb.toString(); 1490 } 1491 1492 /** 1493 * Appends the javascript string literal equivalent of plainText to the given 1494 * out buffer. 1495 * @param plainText the string to escape. 1496 * @param escapeToAscii true to encode all characters not in ascii [\x20-\x7e] 1497 * <br> 1498 * Full escaping of unicode entites isn't required but this makes 1499 * sure that unicode strings will survive regardless of the 1500 * content-encoding of the javascript file which is important when 1501 * we use this function to autogenerated javascript source files. 1502 * This is disabled by default because it makes non-latin strings very long. 1503 * <br> 1504 * If you seem to have trouble with character-encodings, maybe 1505 * turn this on to see if the problem goes away. If so, you need 1506 * to specify a character encoding for your javascript somewhere. 1507 * @param jsEscapingMode determines the type of escaping to perform. 1508 * @param out the buffer to append output to. 
1509 */ 1510 /* 1511 * To avoid fallthrough, we would have to either use a hybrid switch-case/if 1512 * approach (which would obscure our special handling for ' and "), duplicate 1513 * the content of the default case, or pass a half-dozen parameters to a 1514 * helper method containing the code from the default case. 1515 */ 1516 @SuppressWarnings("fallthrough") escapeStringBody( CharSequence plainText, boolean escapeToAscii, JsEscapingMode jsEscapingMode, Appendable out)1517 public static void escapeStringBody( 1518 CharSequence plainText, boolean escapeToAscii, 1519 JsEscapingMode jsEscapingMode, Appendable out) 1520 throws IOException { 1521 int pos = 0; // Index just past the last char in plainText written to out. 1522 int len = plainText.length(); 1523 for (int codePoint, charCount, i = 0; i < len; i += charCount) { 1524 codePoint = Character.codePointAt(plainText, i); 1525 charCount = Character.charCount(codePoint); 1526 1527 if (!shouldEscapeChar(codePoint, escapeToAscii, jsEscapingMode)) { 1528 continue; 1529 } 1530 1531 out.append(plainText, pos, i); 1532 pos = i + charCount; 1533 switch (codePoint) { 1534 case '\b': out.append("\\b"); break; 1535 case '\t': out.append("\\t"); break; 1536 case '\n': out.append("\\n"); break; 1537 case '\f': out.append("\\f"); break; 1538 case '\r': out.append("\\r"); break; 1539 case '\\': out.append("\\\\"); break; 1540 case '"': case '\'': 1541 if (jsEscapingMode == JsEscapingMode.JSON && '\'' == codePoint) { 1542 // JSON does not escape a single quote (and it should be surrounded 1543 // by double quotes). 1544 out.append((char) codePoint); 1545 break; 1546 } else if (jsEscapingMode != JsEscapingMode.EMBEDDABLE_JS) { 1547 out.append('\\').append((char) codePoint); 1548 break; 1549 } 1550 // fall through 1551 default: 1552 if (codePoint >= 0x100 || jsEscapingMode == JsEscapingMode.JSON) { 1553 appendHexJavaScriptRepresentation(codePoint, out); 1554 } else { 1555 // Output the minimal octal encoding. 
We can't use an encoding 1556 // shorter than three digits if the next digit is a valid octal 1557 // digit. 1558 boolean pad = i + charCount >= len 1559 || isOctal(plainText.charAt(i + charCount)); 1560 appendOctalJavaScriptRepresentation((char) codePoint, pad, out); 1561 } 1562 break; 1563 } 1564 } 1565 out.append(plainText, pos, len); 1566 } 1567 1568 /** 1569 * Helper for escapeStringBody, which decides whether to escape a character. 1570 */ shouldEscapeChar(int codePoint, boolean escapeToAscii, JsEscapingMode jsEscapingMode)1571 private static boolean shouldEscapeChar(int codePoint, 1572 boolean escapeToAscii, JsEscapingMode jsEscapingMode) { 1573 // If non-ASCII chars should be escaped, identify non-ASCII code points. 1574 if (escapeToAscii && (codePoint < 0x20 || codePoint > 0x7e)) { 1575 return true; 1576 } 1577 1578 // If in JSON escaping mode, check JSON *and* JS escaping rules. The JS 1579 // escaping rules will escape more characters than needed for JSON, 1580 // but it is safe to escape any character in JSON. 1581 // TODO(bbavar): Remove unnecessary escaping for JSON, as long as it can be 1582 // shown that this change in legacy behavior is safe. 1583 if (jsEscapingMode == JsEscapingMode.JSON) { 1584 return mustEscapeCharInJsonString(codePoint) 1585 || mustEscapeCharInJsString(codePoint); 1586 } 1587 1588 // Finally, just check the default JS escaping rules. 1589 return mustEscapeCharInJsString(codePoint); 1590 } 1591 1592 /** 1593 * Returns a javascript representation of the character in a hex escaped 1594 * format. 1595 * 1596 * @param codePoint The codepoint to append. 1597 * @param out The buffer to which the hex representation should be appended. 
1598 */ appendHexJavaScriptRepresentation( int codePoint, Appendable out)1599 private static void appendHexJavaScriptRepresentation( 1600 int codePoint, Appendable out) 1601 throws IOException { 1602 if (Character.isSupplementaryCodePoint(codePoint)) { 1603 // Handle supplementary unicode values which are not representable in 1604 // javascript. We deal with these by escaping them as two 4B sequences 1605 // so that they will round-trip properly when sent from java to javascript 1606 // and back. 1607 char[] surrogates = Character.toChars(codePoint); 1608 appendHexJavaScriptRepresentation(surrogates[0], out); 1609 appendHexJavaScriptRepresentation(surrogates[1], out); 1610 return; 1611 } 1612 out.append("\\u") 1613 .append(HEX_CHARS[(codePoint >>> 12) & 0xf]) 1614 .append(HEX_CHARS[(codePoint >>> 8) & 0xf]) 1615 .append(HEX_CHARS[(codePoint >>> 4) & 0xf]) 1616 .append(HEX_CHARS[codePoint & 0xf]); 1617 } 1618 1619 /** 1620 * Returns a javascript representation of the character in a hex escaped 1621 * format. Although this is a rather specific method, it is made public 1622 * because it is also used by the JSCompiler. 1623 * 1624 * @param ch The character to append. 1625 * @param pad true to force use of the full 3 digit representation. 1626 * @param out The buffer to which the hex representation should be appended. 1627 */ appendOctalJavaScriptRepresentation( char ch, boolean pad, Appendable out)1628 private static void appendOctalJavaScriptRepresentation( 1629 char ch, boolean pad, Appendable out) throws IOException { 1630 if (ch >= 0100 1631 // Be paranoid at the end of a string since someone might call 1632 // this method again with another string segment. 
1633 || pad) { 1634 out.append('\\') 1635 .append(OCTAL_CHARS[(ch >>> 6) & 0x7]) 1636 .append(OCTAL_CHARS[(ch >>> 3) & 0x7]) 1637 .append(OCTAL_CHARS[ch & 0x7]); 1638 } else if (ch >= 010) { 1639 out.append('\\') 1640 .append(OCTAL_CHARS[(ch >>> 3) & 0x7]) 1641 .append(OCTAL_CHARS[ch & 0x7]); 1642 } else { 1643 out.append('\\') 1644 .append(OCTAL_CHARS[ch & 0x7]); 1645 } 1646 } 1647 1648 /** 1649 * Although this is a rather specific method, it is made public 1650 * because it is also used by the JSCompiler. 1651 * 1652 * @see #appendHexJavaScriptRepresentation(int, Appendable) 1653 */ appendHexJavaScriptRepresentation(StringBuilder sb, char c)1654 public static void appendHexJavaScriptRepresentation(StringBuilder sb, 1655 char c) { 1656 try { 1657 appendHexJavaScriptRepresentation(c, sb); 1658 } catch (IOException ex) { 1659 // StringBuilder does not throw IOException. 1660 throw new RuntimeException(ex); 1661 } 1662 } 1663 1664 /** 1665 * Undo escaping as performed in javaScriptEscape(.) 1666 * Throws an IllegalArgumentException if the string contains 1667 * bad escaping. 1668 */ javaScriptUnescape(String s)1669 public static String javaScriptUnescape(String s) { 1670 StringBuilder sb = new StringBuilder(s.length()); 1671 for (int i = 0; i < s.length(); ) { 1672 char c = s.charAt(i); 1673 if (c == '\\') { 1674 i = javaScriptUnescapeHelper(s, i + 1, sb); 1675 } else { 1676 sb.append(c); 1677 i++; 1678 } 1679 } 1680 return sb.toString(); 1681 } 1682 1683 /** 1684 * Looks for an escape code starting at index i of s, 1685 * and appends it to sb. 1686 * @return the index of the first character in s 1687 * after the escape code. 
1688 * @throws IllegalArgumentException if the escape code 1689 * is invalid 1690 */ javaScriptUnescapeHelper(String s, int i, StringBuilder sb)1691 private static int javaScriptUnescapeHelper(String s, int i, 1692 StringBuilder sb) { 1693 if (i >= s.length()) { 1694 throw new IllegalArgumentException( 1695 "End-of-string after escape character in [" + s + "]"); 1696 } 1697 1698 char c = s.charAt(i++); 1699 switch (c) { 1700 case 'n': sb.append('\n'); break; 1701 case 'r': sb.append('\r'); break; 1702 case 't': sb.append('\t'); break; 1703 case 'b': sb.append('\b'); break; 1704 case 'f': sb.append('\f'); break; 1705 case '\\': 1706 case '\"': 1707 case '\'': 1708 case '>': 1709 sb.append(c); 1710 break; 1711 case '0': case '1': case '2': case '3': 1712 case '4': case '5': case '6': case '7': 1713 --i; // backup to first octal digit 1714 int nOctalDigits = 1; 1715 int digitLimit = c < '4' ? 3 : 2; 1716 while (nOctalDigits < digitLimit && i + nOctalDigits < s.length() 1717 && isOctal(s.charAt(i + nOctalDigits))) { 1718 ++nOctalDigits; 1719 } 1720 sb.append( 1721 (char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8)); 1722 i += nOctalDigits; 1723 break; 1724 case 'x': 1725 case 'u': 1726 String hexCode; 1727 int nHexDigits = (c == 'u' ? 
4 : 2); 1728 try { 1729 hexCode = s.substring(i, i + nHexDigits); 1730 } catch (IndexOutOfBoundsException ioobe) { 1731 throw new IllegalArgumentException( 1732 "Invalid unicode sequence [" + s.substring(i) + "] at index " + i 1733 + " in [" + s + "]"); 1734 } 1735 int unicodeValue; 1736 try { 1737 unicodeValue = Integer.parseInt(hexCode, 16); 1738 } catch (NumberFormatException nfe) { 1739 throw new IllegalArgumentException( 1740 "Invalid unicode sequence [" + hexCode + "] at index " + i + 1741 " in [" + s + "]"); 1742 } 1743 sb.append((char) unicodeValue); 1744 i += nHexDigits; 1745 break; 1746 default: 1747 throw new IllegalArgumentException( 1748 "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]" 1749 ); 1750 } 1751 1752 return i; 1753 } 1754 1755 // C0 control characters except \t, \n, and \r and 0xFFFE and 0xFFFF 1756 private static final CharMatcher CONTROL_MATCHER = CharMatcher.anyOf( 1757 "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + 1758 "\u0008\u000B\u000C\u000E\u000F" + 1759 "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + 1760 "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + 1761 "\uFFFE\uFFFF"); 1762 1763 /** 1764 * Escape a string that is meant to be embedded in a CDATA section. 1765 * The returned string is guaranteed to be valid CDATA content. 1766 * The syntax of CDATA sections is the following: 1767 * <blockquote> 1768 * <code><[!CDATA[...]]></code> 1769 * </blockquote> 1770 * The only invalid character sequence in a CDATA tag is "]]>". 1771 * If this sequence is present in the input string, we replace 1772 * it by closing the current CDATA field, then write ']]&gt;', 1773 * then reopen a new CDATA section. 1774 */ 1775 public static String xmlCDataEscape(String s) { 1776 // Make sure there are no illegal control characters. 1777 s = CONTROL_MATCHER.removeFrom(s); 1778 // Return the original reference if the string doesn't have a match. 
1779 int found = s.indexOf("]]>"); 1780 if (found == -1) { 1781 return s; 1782 } 1783 1784 // For each occurrence of "]]>", append a string that adds "]]>" after 1785 // the end of the CDATA which has just been closed, then opens a new CDATA. 1786 StringBuilder sb = new StringBuilder(); 1787 int prev = 0; 1788 do { 1789 sb.append(s.substring(prev, found + 3)); 1790 sb.append("]]><![CDATA["); 1791 prev = found + 3; 1792 } while ((found = s.indexOf("]]>", prev)) != -1); 1793 sb.append(s.substring(prev)); 1794 return sb.toString(); 1795 } 1796 1797 /** 1798 * We escape some characters in s to be able to insert strings into Java code 1799 * 1800 * @deprecated Use {@link CharEscapers#asciiHtmlEscaper()} and {@link 1801 * CharEscapers#javaCharEscaper()} or {@link CharEscapers#javaStringEscaper()} 1802 * instead. This method combines two forms of escaping in a way that's rarely 1803 * desired. 1804 */ 1805 @Deprecated 1806 public static String javaEscape(String s) { 1807 return JAVA_ESCAPE.escape(s); 1808 } 1809 1810 // Java escaper. 1811 private static final CharEscaper JAVA_ESCAPE = 1812 new CharEscaperBuilder() 1813 .addEscape('\n', "\\n") 1814 .addEscape('\r', "\\r") 1815 .addEscape('\t', "\\t") 1816 .addEscape('\\', "\\\\") 1817 .addEscape('\"', "\\\"") 1818 .addEscape('&', "&") 1819 .addEscape('<', "<") 1820 .addEscape('>', ">") 1821 .addEscape('\'', "\\\'") 1822 .toEscaper(); 1823 1824 /** 1825 * Escapes the special characters from a string so it can be used as part of 1826 * a regex pattern. This method is for use on gnu.regexp style regular 1827 * expressions. 1828 * 1829 * @deprecated Use {@link Pattern#quote(String)} instead. Note that it may not 1830 * be compatible with gnu.regexp style regular expressions. 1831 */ 1832 @Deprecated 1833 public static String regexEscape(String s) { 1834 return REGEX_ESCAPE.escape(s); 1835 } 1836 1837 // Regex escaper escapes all regex characters. 
1838 private static final CharEscaper REGEX_ESCAPE = 1839 new CharEscaperBuilder() 1840 .addEscape('(', "\\(") 1841 .addEscape(')', "\\)") 1842 .addEscape('|', "\\|") 1843 .addEscape('*', "\\*") 1844 .addEscape('+', "\\+") 1845 .addEscape('?', "\\?") 1846 .addEscape('.', "\\.") 1847 .addEscape('{', "\\{") 1848 .addEscape('}', "\\}") 1849 .addEscape('[', "\\[") 1850 .addEscape(']', "\\]") 1851 .addEscape('$', "\\$") 1852 .addEscape('^', "\\^") 1853 .addEscape('\\', "\\\\") 1854 .toEscaper(); 1855 1856 /** 1857 * If you want to preserve the exact 1858 * current (odd) behavior when {@code doStrip} is {@code true}, use 1859 * {@code .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on 1860 * the splitter. 1861 * 1862 * @param in what to process 1863 * @param delimiter the delimiting string 1864 * @return the tokens 1865 * @deprecated see the detailed instructions under 1866 * {@link #split(String, String, boolean)} 1867 */ 1868 @Deprecated 1869 public static LinkedList<String> string2List( 1870 String in, String delimiter, boolean doStrip) { 1871 if (in == null) { 1872 return null; 1873 } 1874 1875 LinkedList<String> out = new LinkedList<String>(); 1876 string2Collection(in, delimiter, doStrip, out); 1877 return out; 1878 } 1879 1880 /** 1881 * See the detailed instructions under {@link 1882 * #split(String, String, boolean)}. Pass the resulting {@code Iterable} to 1883 * {@link com.google.common.collect.Sets#newHashSet(Iterable)}. If you want to 1884 * preserve the exact current (odd) behavior when {@code doStrip} is {@code 1885 * true}, use {@code 1886 * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the 1887 * splitter. 
1888 * 1889 * @param in what to process 1890 * @param delimiter the delimiting string 1891 * @param doStrip to strip the substrings before adding to the list 1892 * @return the tokens 1893 * @deprecated see the detailed instructions under 1894 * {@link #split(String, String, boolean)} 1895 */ 1896 @Deprecated 1897 public static Set<String> string2Set( 1898 String in, String delimiter, boolean doStrip) { 1899 if (in == null) { 1900 return null; 1901 } 1902 1903 HashSet<String> out = new HashSet<String>(); 1904 string2Collection(in, delimiter, doStrip, out); 1905 return out; 1906 } 1907 1908 /** 1909 * See the detailed instructions under {@link 1910 * #split(String, String, boolean)}. If you want to preserve the exact current 1911 * (odd) behavior when {@code doStrip} is {@code true}, use {@code 1912 * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the 1913 * splitter. 1914 * 1915 * @param in The delimited input string to process 1916 * @param delimiter The string delimiting entries in the input string. 1917 * @param doStrip whether to strip the substrings before adding to the 1918 * collection 1919 * @param collection The collection to which the strings will be added. If 1920 * <code>null</code>, a new <code>List</code> will be created. 1921 * @return The collection to which the substrings were added. This is 1922 * syntactic sugar to allow call chaining. 
1923 * @deprecated see the detailed instructions under 1924 * {@link #split(String, String, boolean)} 1925 */ 1926 @Deprecated 1927 public static Collection<String> string2Collection( 1928 String in, 1929 String delimiter, 1930 boolean doStrip, 1931 Collection<String> collection) { 1932 if (in == null) { 1933 return null; 1934 } 1935 if (collection == null) { 1936 collection = new ArrayList<String>(); 1937 } 1938 if (delimiter == null || delimiter.length() == 0) { 1939 collection.add(in); 1940 return collection; 1941 } 1942 1943 int fromIndex = 0; 1944 int pos; 1945 while ((pos = in.indexOf(delimiter, fromIndex)) >= 0) { 1946 String interim = in.substring(fromIndex, pos); 1947 if (doStrip) { 1948 interim = strip(interim); 1949 } 1950 if (!doStrip || interim.length() > 0) { 1951 collection.add(interim); 1952 } 1953 1954 fromIndex = pos + delimiter.length(); 1955 } 1956 1957 String interim = in.substring(fromIndex); 1958 if (doStrip) { 1959 interim = strip(interim); 1960 } 1961 if (!doStrip || interim.length() > 0) { 1962 collection.add(interim); 1963 } 1964 1965 return collection; 1966 } 1967 1968 /** 1969 * This converts a string to a Map. It will first split the string into 1970 * entries using delimEntry. Then each entry is split into a key and a value 1971 * using delimKey. By default we strip the keys. Use doStripEntry to strip 1972 * also the entries. 1973 * 1974 * Note that this method returns a {@link HashMap}, which means that entries 1975 * will be in no particular order. See {@link #stringToOrderedMap}. 
1976 * 1977 * @param in the string to be processed 1978 * @param delimEntry delimiter for the entries 1979 * @param delimKey delimiter between keys and values 1980 * @param doStripEntry strip entries before inserting in the map 1981 * 1982 * @return HashMap 1983 */ string2Map( String in, String delimEntry, String delimKey, boolean doStripEntry)1984 public static HashMap<String, String> string2Map( 1985 String in, String delimEntry, String delimKey, 1986 boolean doStripEntry) { 1987 if (in == null) { 1988 return null; 1989 } 1990 1991 return stringToMapImpl(new HashMap<String, String>(), in, delimEntry, 1992 delimKey, doStripEntry); 1993 } 1994 1995 /** 1996 * This converts a string to a Map, with entries in the same order as the 1997 * key/value pairs in the input string. It will first split the string into 1998 * entries using delimEntry. Then each entry is split into a key and a value 1999 * using delimKey. By default we strip the keys. Use doStripEntry to strip 2000 * also the entries. 2001 * 2002 * @param in the string to be processed 2003 * @param delimEntry delimiter for the entries 2004 * @param delimKey delimiter between keys and values 2005 * @param doStripEntry strip entries before inserting in the map 2006 * 2007 * @return key/value pairs as a Map, in order 2008 */ stringToOrderedMap( String in, String delimEntry, String delimKey, boolean doStripEntry)2009 public static Map<String, String> stringToOrderedMap( 2010 String in, String delimEntry, String delimKey, 2011 boolean doStripEntry) { 2012 if (in == null) { 2013 return null; 2014 } 2015 2016 return stringToMapImpl(new LinkedHashMap<String, String>(), in, delimEntry, 2017 delimKey, doStripEntry); 2018 } 2019 2020 /** 2021 * This adds key/value pairs from the given string to the given Map. 2022 * It will first split the string into entries using delimEntry. Then each 2023 * entry is split into a key and a value using delimKey. By default we 2024 * strip the keys. 
Use doStripEntry to strip also the entries. 2025 * 2026 * @param out - Map to output into 2027 * @param in - the string to be processed 2028 * @param delimEntry - delimiter for the entries 2029 * @param delimKey - delimiter between keys and values 2030 * @param doStripEntry - strip entries before inserting in the map 2031 * @return out, for caller's convenience 2032 */ stringToMapImpl(T out, String in, String delimEntry, String delimKey, boolean doStripEntry)2033 private static <T extends Map<String, String>> T stringToMapImpl(T out, 2034 String in, String delimEntry, String delimKey, boolean doStripEntry) { 2035 2036 if (isEmpty(delimEntry) || isEmpty(delimKey)) { 2037 out.put(strip(in), ""); 2038 return out; 2039 } 2040 2041 Iterator<String> it = string2List(in, delimEntry, false).iterator(); 2042 int len = delimKey.length(); 2043 while (it.hasNext()) { 2044 String entry = it.next(); 2045 int pos = entry.indexOf(delimKey); 2046 if (pos > 0) { 2047 String value = entry.substring(pos + len); 2048 if (doStripEntry) { 2049 value = strip(value); 2050 } 2051 out.put(strip(entry.substring(0, pos)), value); 2052 } else { 2053 out.put(strip(entry), ""); 2054 } 2055 } 2056 2057 return out; 2058 } 2059 2060 /** 2061 * This function concatenates the elements of a Map in a string with form 2062 * "<key1><sepKey><value1><sepEntry>...<keyN><sepKey><valueN>" 2063 * 2064 * @param in - the map to be converted 2065 * @param sepKey - the separator to put between key and value 2066 * @param sepEntry - the separator to put between map entries 2067 * @return String 2068 * @deprecated create a {@link MapJoiner}, for example {@code 2069 * Joiner.on(sepEntry).withKeyValueSeparator(sepKey)}. Ensure that your 2070 * map is non-null and use this map joiner's {@link MapJoiner#join(Map)} 2071 * method. To preserve behavior exactly, just in-line this method call. 
2072 */ map2String( Map<K, V> in, String sepKey, String sepEntry)2073 @Deprecated public static <K, V> String map2String( 2074 Map<K, V> in, String sepKey, String sepEntry) { 2075 return (in == null) ? null : Joiner 2076 .on(sepEntry) 2077 .useForNull("null") 2078 .withKeyValueSeparator(sepKey) 2079 .join(in); 2080 } 2081 2082 /** 2083 * Given a map, creates and returns a new map in which all keys are the 2084 * lower-cased version of each key. 2085 * 2086 * @param map A map containing String keys to be lowercased 2087 * @throws IllegalArgumentException if the map contains duplicate string keys 2088 * after lower casing 2089 */ lowercaseKeys(Map<String, V> map)2090 public static <V> Map<String, V> lowercaseKeys(Map<String, V> map) { 2091 Map<String, V> result = new HashMap<String, V>(map.size()); 2092 for (Map.Entry<String, V> entry : map.entrySet()) { 2093 String key = entry.getKey(); 2094 if (result.containsKey(key.toLowerCase())) { 2095 throw new IllegalArgumentException( 2096 "Duplicate string key in map when lower casing"); 2097 } 2098 result.put(key.toLowerCase(), entry.getValue()); 2099 } 2100 return result; 2101 } 2102 2103 /** 2104 * Replaces any string of adjacent whitespace characters with the whitespace 2105 * character " ". 2106 * 2107 * @param str the string you want to munge 2108 * @return String with no more excessive whitespace! 2109 * @deprecated ensure the string is not null and use {@code 2110 * CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ')}; also consider 2111 * whether you really want the legacy whitespace definition, or something 2112 * more standard like {@link CharMatcher#WHITESPACE}. 2113 */ collapseWhitespace(String str)2114 @Deprecated public static String collapseWhitespace(String str) { 2115 return (str == null) ? 
null 2116 : CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' '); 2117 } 2118 2119 /** 2120 * Replaces any string of matched characters with the supplied string.<p> 2121 * 2122 * This is a more general version of collapseWhitespace. 2123 * 2124 * <pre> 2125 * E.g. collapse("hello world", " ", "::") 2126 * will return the following string: "hello::world" 2127 * </pre> 2128 * 2129 * @param str the string you want to munge 2130 * @param chars all of the characters to be considered for munge 2131 * @param replacement the replacement string 2132 * @return munged and replaced string. 2133 * @deprecated if {@code replacement} is the empty string, use {@link 2134 * CharMatcher#removeFrom(CharSequence)}; if it is a single character, 2135 * use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer 2136 * replacement strings use {@link String#replaceAll(String, String)} with 2137 * a regular expression that matches one or more occurrences of {@code 2138 * chars}. In all cases you must first ensure that {@code str} is not 2139 * null. 2140 */ collapse( String str, String chars, String replacement)2141 @Deprecated public static String collapse( 2142 String str, String chars, String replacement) { 2143 if (str == null) { 2144 return null; 2145 } 2146 2147 StringBuilder newStr = new StringBuilder(); 2148 2149 boolean prevCharMatched = false; 2150 char c; 2151 for (int i = 0; i < str.length(); i++) { 2152 c = str.charAt(i); 2153 if (chars.indexOf(c) != -1) { 2154 // this character is matched 2155 if (prevCharMatched) { 2156 // apparently a string of matched chars, so don't append anything 2157 // to the string 2158 continue; 2159 } 2160 prevCharMatched = true; 2161 newStr.append(replacement); 2162 } else { 2163 prevCharMatched = false; 2164 newStr.append(c); 2165 } 2166 } 2167 2168 return newStr.toString(); 2169 } 2170 2171 /** 2172 * Returns a string with all sequences of ISO control chars (0x00 to 0x1F and 2173 * 0x7F to 0x9F) replaced by the supplied string. 
ISO control characters are 2174 * identified via {@link Character#isISOControl(char)}. 2175 * 2176 * @param str the string you want to strip of ISO control chars 2177 * @param replacement the replacement string 2178 * @return a String with all control characters replaced by the replacement 2179 * string, or null if input is null. 2180 * @deprecated use {@link CharMatcher#JAVA_ISO_CONTROL}. If {@code 2181 * replacement} is the empty string, use {@link 2182 * CharMatcher#removeFrom(CharSequence)}; if it is a single character, 2183 * use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer 2184 * replacement strings use 2185 * {@code str.replaceAll("\p{Cntrl}+", replacement)}. 2186 * In all cases you must first ensure that {@code str} is not null. 2187 */ collapseControlChars( String str, String replacement)2188 @Deprecated public static String collapseControlChars( 2189 String str, String replacement) { 2190 /* 2191 * We re-implement the StringUtil.collapse() loop here rather than call 2192 * collapse() with an input String of control chars, because matching via 2193 * isISOControl() is about 10x faster. 2194 */ 2195 if (str == null) { 2196 return null; 2197 } 2198 2199 StringBuilder newStr = new StringBuilder(); 2200 2201 boolean prevCharMatched = false; 2202 char c; 2203 for (int i = 0; i < str.length(); i++) { 2204 c = str.charAt(i); 2205 if (Character.isISOControl(c)) { 2206 // this character is matched 2207 if (prevCharMatched) { 2208 // apparently a string of matched chars, so don't append anything 2209 // to the string 2210 continue; 2211 } 2212 prevCharMatched = true; 2213 newStr.append(replacement); 2214 } else { 2215 prevCharMatched = false; 2216 newStr.append(c); 2217 } 2218 } 2219 2220 return newStr.toString(); 2221 } 2222 2223 /** 2224 * Read a String of up to maxLength bytes from an InputStream. 
2225 * 2226 * <p>Note that this method uses the default platform encoding, and expects 2227 * that encoding to be single-byte, which is not always the case. Its use 2228 * is discouraged. For reading the entire stream (maxLength == -1) you can use: 2229 * <pre> 2230 * CharStreams.toString(new InputStreamReader(is, Charsets.ISO_8859_1)) 2231 * </pre> 2232 * {@code CharStreams} is in the {@code com.google.common.io} package. 2233 * 2234 * <p>For maxLength >= 0 a literal translation would be 2235 * <pre> 2236 * CharStreams.toString(new InputStreamReader( 2237 * new LimitInputStream(is, maxLength), Charsets.ISO_8859_1)) 2238 * </pre> 2239 * For multi-byte encodings that is broken because the limit could end in 2240 * the middle of the character--it would be better to limit the reader than 2241 * the underlying stream. 2242 * 2243 * @param is input stream 2244 * @param maxLength max number of bytes to read from "is". If this is -1, we 2245 * read everything. 2246 * 2247 * @return String up to maxLength bytes, read from "is" 2248 * @deprecated see the advice above 2249 */ stream2String(InputStream is, int maxLength)2250 @Deprecated public static String stream2String(InputStream is, int maxLength) 2251 throws IOException { 2252 byte[] buffer = new byte[4096]; 2253 StringWriter sw = new StringWriter(); 2254 int totalRead = 0; 2255 int read = 0; 2256 2257 do { 2258 sw.write(new String(buffer, 0, read)); 2259 totalRead += read; 2260 read = is.read(buffer, 0, buffer.length); 2261 } while (((-1 == maxLength) || (totalRead < maxLength)) && (read != -1)); 2262 2263 return sw.toString(); 2264 } 2265 2266 /** 2267 * Parse a list of substrings separated by a given delimiter. 
/**
 * Parse a list of substrings separated by a given delimiter. The delimiter
 * can also appear in substrings (just double them):
 *
 * <pre>
 *   parseDelimitedList("this|is", '|') returns ["this","is"]
 *   parseDelimitedList("this||is", '|') returns ["this|is"]
 * </pre>
 *
 * Substrings that turn out empty are not added to the result.
 *
 * @param list String containing delimited substrings
 * @param delimiter Delimiter (anything except ' ' is allowed, because a
 *     space is what gets appended as the end-of-input sentinel)
 *
 * @return String[] A String array of parsed substrings
 */
public static String[] parseDelimitedList(String list,
                                          char delimiter) {
  String delim = "" + delimiter;
  // Append a sentinel of delimiter + space
  // (see comments below for more info)
  StringTokenizer st = new StringTokenizer(list + delim + " ",
                                           delim,
                                           true);
  ArrayList<String> v = new ArrayList<String>();
  // Previous token seen; null right after an escaped (doubled) delimiter.
  String lastToken = "";
  // Substring currently being assembled.
  StringBuilder word = new StringBuilder();

  // We keep a sliding window of 2 tokens
  //
  // delimiter : delimiter -> append delimiter to current word
  //                          and clear most recent token
  //                          (so delim : delim : delim will not
  //                          be treated as two escaped delims.)
  //
  // tok : delimiter -> append tok to current word
  //
  // delimiter : tok -> add current word to list, and clear it.
  //                    (We append a sentinel that conforms to this
  //                     pattern to make sure we've pushed every parsed token)
  while (st.hasMoreTokens()) {
    String tok = st.nextToken();
    if (lastToken != null) {
      if (tok.equals(delim)) {
        word.append(lastToken);
        // Two delimiters in a row form one escaped delimiter; forget it so
        // that a third delimiter starts a fresh window.
        if (lastToken.equals(delim)) { tok = null; }
      } else {
        // A non-delimiter token ends the word collected so far; empty words
        // are dropped.
        if (word.length() != 0) {
          v.add(word.toString());
        }
        word.setLength(0);
      }
    }
    lastToken = tok;
  }

  return v.toArray(new String[0]);
}
2326 * @deprecated use {@link String#CASE_INSENSITIVE_ORDER}, together with 2327 * {@link com.google.common.collect.Ordering#nullsFirst()} or 2328 * {@link com.google.common.collect.Ordering#nullsLast()} if 2329 * needed 2330 */ compareToIgnoreCase(String s1, String s2, boolean nullsAreGreater)2331 @Deprecated public static int compareToIgnoreCase(String s1, String s2, 2332 boolean nullsAreGreater) { 2333 if (s1 == s2) { 2334 return 0; // Either both the same String, or both null 2335 } 2336 if (s1 == null) { 2337 return nullsAreGreater ? 1 : -1; 2338 } 2339 if (s2 == null) { 2340 return nullsAreGreater ? -1 : 1; 2341 } 2342 return s1.compareToIgnoreCase(s2); 2343 } 2344 2345 /** 2346 * Splits s with delimiters in delimiter and returns the last token 2347 */ lastToken(String s, String delimiter)2348 public static String lastToken(String s, String delimiter) { 2349 return s.substring(CharMatcher.anyOf(delimiter).lastIndexIn(s) + 1); 2350 } 2351 2352 private static final Pattern characterReferencePattern = 2353 Pattern.compile("&#?[a-zA-Z0-9]{1,8};"); 2354 2355 /** 2356 * Determines if a string contains what looks like an html character 2357 * reference. Useful for deciding whether unescaping is necessary. 2358 */ containsCharRef(String s)2359 public static boolean containsCharRef(String s) { 2360 return characterReferencePattern.matcher(s).find(); 2361 } 2362 2363 /** 2364 * Determines if a string is a Hebrew word. A string is considered to be 2365 * a Hebrew word if {@link #isHebrew(int)} is true for any of its characters. 2366 */ isHebrew(String s)2367 public static boolean isHebrew(String s) { 2368 int len = s.length(); 2369 for (int i = 0; i < len; ++i) { 2370 if (isHebrew(s.codePointAt(i))) { 2371 return true; 2372 } 2373 } 2374 return false; 2375 } 2376 2377 /** 2378 * Determines if a character is a Hebrew character. 
/**
 * Determines if a character is a Hebrew character.
 *
 * @param codePoint the Unicode code point to test
 * @return true if the code point lies in the Hebrew Unicode block
 */
public static boolean isHebrew(int codePoint) {
  return Character.UnicodeBlock.of(codePoint)
      == Character.UnicodeBlock.HEBREW;
}

/**
 * Determines if a string is a CJK word. A string is considered to be CJK
 * if {@link #isCjk(int)} is true for any of its characters.
 *
 * @param s the string to inspect
 * @return true if at least one character of {@code s} is CJK
 */
public static boolean isCjk(String s) {
  final int length = s.length();
  int index = 0;
  while (index < length) {
    if (isCjk(s.codePointAt(index))) {
      return true;
    }
    index++;
  }
  return false;
}

/**
 * Unicode code blocks containing CJK characters.
 */
private static final Set<Character.UnicodeBlock> CJK_BLOCKS;
static {
  Set<Character.UnicodeBlock> blocks = new HashSet<Character.UnicodeBlock>();
  Collections.addAll(blocks,
      Character.UnicodeBlock.HANGUL_JAMO,
      Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT,
      Character.UnicodeBlock.KANGXI_RADICALS,
      Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION,
      Character.UnicodeBlock.HIRAGANA,
      Character.UnicodeBlock.KATAKANA,
      Character.UnicodeBlock.BOPOMOFO,
      Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO,
      Character.UnicodeBlock.KANBUN,
      Character.UnicodeBlock.BOPOMOFO_EXTENDED,
      Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS,
      Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS,
      Character.UnicodeBlock.CJK_COMPATIBILITY,
      Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
      Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS,
      Character.UnicodeBlock.HANGUL_SYLLABLES,
      Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS,
      Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS,
      Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS,
      Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
      Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT);
  CJK_BLOCKS = Collections.unmodifiableSet(blocks);
}

/**
 * Determines if a character is a CJK ideograph or a character typically
 * used only in CJK text.
 *
 * Note: This function cannot handle supplementary characters. To handle all
 * Unicode characters, including supplementary characters, use the function
 * {@link #isCjk(int)}.
 *
 * @param ch the character to test
 * @return true if the character belongs to one of the CJK blocks
 */
public static boolean isCjk(char ch) {
  int codePoint = ch;
  return isCjk(codePoint);
}

/**
 * Determines if a character is a CJK ideograph or a character typically
 * used only in CJK text.
 *
 * @param codePoint the Unicode code point to test
 * @return true if the code point belongs to one of the CJK blocks
 */
public static boolean isCjk(int codePoint) {
  // Time-saving early exit for all Latin-1 characters: nothing set above
  // the low eight bits.
  if ((codePoint >>> 8) == 0) {
    return false;
  }

  Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
  return CJK_BLOCKS.contains(block);
}

/**
 * Returns the approximate display width of the string, measured in units of
 * ascii characters. The width is the sum of the widths of the individual
 * {@code char} values.
 *
 * @param s the string to measure
 * @return the approximate display width of {@code s}
 * @see StringUtil#displayWidth(char)
 */
public static int displayWidth(String s) {
  // TODO(kevinb): could reimplement this as
  // return s.length() * 2 - CharMatcher.SINGLE_WIDTH.countIn(s);
  int total = 0;
  final int length = s.length();
  int index = 0;
  while (index < length) {
    total += displayWidth(s.charAt(index));
    index++;
  }
  return total;
}

/**
 * Returns the approximate display width of the character, measured
 * in units of ascii characters.
 *
 * This method should err on the side of caution. By default, characters
 * are assumed to have width 2; this covers CJK ideographs, various
 * symbols and miscellaneous weird scripts. Given below are some Unicode
 * ranges for which it seems safe to assume that no character is
 * substantially wider than an ascii character:
 *   - Latin, extended Latin, even more extended Latin.
 *   - Greek, extended Greek, Cyrillic.
 *   - Some symbols (including currency symbols) and punctuation.
 *   - Half-width Katakana and Hangul.
 *   - Hebrew
 *   - Arabic
 *   - Thai
 * Characters in these ranges are given a width of 1.
 *
 * IMPORTANT: this function has analogs in C++ (encodingutils.cc,
 * named UnicodeCharWidth) and JavaScript
 * (java/com/google/ads/common/frontend/adwordsbase/resources/CreateAdUtil.js),
 * which need to be updated if you change the implementation here.
 *
 * @param ch the character to measure
 * @return 1 for characters in the narrow ranges listed above, 2 otherwise
 */
public static int displayWidth(char ch) {
  // Everything up to CYRILLIC SMALL LETTER YERU WITH DIAERESIS.
  if (ch <= '\u04f9') {
    return 1;
  }
  // Hebrew: MAQAF, the letters ALEF ... TAV, GERESH and GERSHAYIM.
  if (ch == '\u05be'
      || (ch >= '\u05d0' && ch <= '\u05ea')
      || ch == '\u05F3'
      || ch == '\u05f4') {
    return 1;
  }
  // Arabic, Arabic Supplement and Arabic Presentation Forms A and B.
  if ((ch >= '\u0600' && ch <= '\u06ff')
      || (ch >= '\u0750' && ch <= '\u077f')
      || (ch >= '\ufb50' && ch <= '\ufdff')
      || (ch >= '\ufe70' && ch <= '\ufeff')) {
    return 1;
  }
  // LATIN CAPITAL LETTER A WITH RING BELOW ... DRACHMA SIGN, and
  // ACCOUNT OF ... ROTATED CAPITAL Q.
  if ((ch >= '\u1e00' && ch <= '\u20af')
      || (ch >= '\u2100' && ch <= '\u213a')) {
    return 1;
  }
  // Thai.
  if (ch >= '\u0e00' && ch <= '\u0e7f') {
    return 1;
  }
  // HALFWIDTH IDEOGRAPHIC FULL STOP ... HALFWIDTH HANGUL LETTER I.
  if (ch >= '\uff61' && ch <= '\uffdc') {
    return 1;
  }
  return 2;
}
2517 */ toString(float[] iArray)2518 public static String toString(float[] iArray) { 2519 if (iArray == null) { 2520 return "NULL"; 2521 } 2522 2523 StringBuilder buffer = new StringBuilder(); 2524 buffer.append("["); 2525 for (int i = 0; i < iArray.length; i++) { 2526 buffer.append(iArray[i]); 2527 if (i != (iArray.length - 1)) { 2528 buffer.append(", "); 2529 } 2530 } 2531 buffer.append("]"); 2532 return buffer.toString(); 2533 } 2534 2535 /** 2536 * @return a string representation of the given native array. 2537 */ toString(long[] iArray)2538 public static String toString(long[] iArray) { 2539 if (iArray == null) { 2540 return "NULL"; 2541 } 2542 2543 StringBuilder buffer = new StringBuilder(); 2544 buffer.append("["); 2545 for (int i = 0; i < iArray.length; i++) { 2546 buffer.append(iArray[i]); 2547 if (i != (iArray.length - 1)) { 2548 buffer.append(", "); 2549 } 2550 } 2551 buffer.append("]"); 2552 return buffer.toString(); 2553 } 2554 2555 /** 2556 * @return a string representation of the given native array 2557 */ toString(int[] iArray)2558 public static String toString(int[] iArray) { 2559 if (iArray == null) { 2560 return "NULL"; 2561 } 2562 2563 StringBuilder buffer = new StringBuilder(); 2564 buffer.append("["); 2565 for (int i = 0; i < iArray.length; i++) { 2566 buffer.append(iArray[i]); 2567 if (i != (iArray.length - 1)) { 2568 buffer.append(", "); 2569 } 2570 } 2571 buffer.append("]"); 2572 return buffer.toString(); 2573 } 2574 2575 /** 2576 * @return a string representation of the given array. 
2577 */ toString(String[] iArray)2578 public static String toString(String[] iArray) { 2579 if (iArray == null) { return "NULL"; } 2580 2581 StringBuilder buffer = new StringBuilder(); 2582 buffer.append("["); 2583 for (int i = 0; i < iArray.length; i++) { 2584 buffer.append("'").append(iArray[i]).append("'"); 2585 if (i != iArray.length - 1) { 2586 buffer.append(", "); 2587 } 2588 } 2589 buffer.append("]"); 2590 2591 return buffer.toString(); 2592 } 2593 2594 /** 2595 * Returns the string, in single quotes, or "NULL". Intended only for 2596 * logging. 2597 * 2598 * @param s the string 2599 * @return the string, in single quotes, or the string "null" if it's null. 2600 */ toString(String s)2601 public static String toString(String s) { 2602 if (s == null) { 2603 return "NULL"; 2604 } else { 2605 return new StringBuilder(s.length() + 2).append("'").append(s) 2606 .append("'").toString(); 2607 } 2608 } 2609 2610 /** 2611 * @return a string representation of the given native array 2612 */ toString(int[][] iArray)2613 public static String toString(int[][] iArray) { 2614 if (iArray == null) { 2615 return "NULL"; 2616 } 2617 2618 StringBuilder buffer = new StringBuilder(); 2619 buffer.append("["); 2620 for (int i = 0; i < iArray.length; i++) { 2621 buffer.append("["); 2622 for (int j = 0; j < iArray[i].length; j++) { 2623 buffer.append(iArray[i][j]); 2624 if (j != (iArray[i].length - 1)) { 2625 buffer.append(", "); 2626 } 2627 } 2628 buffer.append("]"); 2629 if (i != iArray.length - 1) { 2630 buffer.append(" "); 2631 } 2632 } 2633 buffer.append("]"); 2634 return buffer.toString(); 2635 } 2636 2637 /** 2638 * @return a string representation of the given native array. 
2639 */ toString(long[][] iArray)2640 public static String toString(long[][] iArray) { 2641 if (iArray == null) { return "NULL"; } 2642 2643 StringBuilder buffer = new StringBuilder(); 2644 buffer.append("["); 2645 for (int i = 0; i < iArray.length; i++) { 2646 buffer.append("["); 2647 for (int j = 0; j < iArray[i].length; j++) { 2648 buffer.append(iArray[i][j]); 2649 if (j != (iArray[i].length - 1)) { 2650 buffer.append(", "); 2651 } 2652 } 2653 buffer.append("]"); 2654 if (i != iArray.length - 1) { 2655 buffer.append(" "); 2656 } 2657 } 2658 buffer.append("]"); 2659 return buffer.toString(); 2660 } 2661 2662 /** 2663 * @return a String representation of the given object array. 2664 * The strings are obtained by calling toString() on the 2665 * underlying objects. 2666 */ toString(Object[] obj)2667 public static String toString(Object[] obj) { 2668 if (obj == null) { return "NULL"; } 2669 StringBuilder tmp = new StringBuilder(); 2670 tmp.append("["); 2671 for (int i = 0; i < obj.length; i++) { 2672 tmp.append(obj[i].toString()); 2673 if (i != obj.length - 1) { 2674 tmp.append(","); 2675 } 2676 } 2677 tmp.append("]"); 2678 return tmp.toString(); 2679 } 2680 2681 private static final char[] HEX_CHARS 2682 = { '0', '1', '2', '3', '4', '5', '6', '7', 2683 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; 2684 private static final char[] OCTAL_CHARS = HEX_CHARS; // ignore the last 8 :) 2685 2686 /** 2687 * Convert a byte array to a hex-encoding string: "a33bff00..." 2688 * 2689 * @deprecated Use {@link ByteArrays#toHexString}. 2690 */ bytesToHexString(final byte[] bytes)2691 @Deprecated public static String bytesToHexString(final byte[] bytes) { 2692 return ByteArrays.toHexString(bytes); 2693 } 2694 2695 /** 2696 * Convert a byte array to a hex-encoding string with the specified 2697 * delimiter: "a3<delimiter>3b<delimiter>ff..." 
2698 */ bytesToHexString(final byte[] bytes, Character delimiter)2699 public static String bytesToHexString(final byte[] bytes, 2700 Character delimiter) { 2701 StringBuilder hex = 2702 new StringBuilder(bytes.length * (delimiter == null ? 2 : 3)); 2703 int nibble1, nibble2; 2704 for (int i = 0; i < bytes.length; i++) { 2705 nibble1 = (bytes[i] >>> 4) & 0xf; 2706 nibble2 = bytes[i] & 0xf; 2707 if (i > 0 && delimiter != null) { hex.append(delimiter.charValue()); } 2708 hex.append(HEX_CHARS[nibble1]); 2709 hex.append(HEX_CHARS[nibble2]); 2710 } 2711 return hex.toString(); 2712 } 2713 2714 /** 2715 * Safely convert the string to uppercase. 2716 * @return upper case representation of the String; or null if 2717 * the input string is null. 2718 */ toUpperCase(String src)2719 public static String toUpperCase(String src) { 2720 if (src == null) { 2721 return null; 2722 } else { 2723 return src.toUpperCase(); 2724 } 2725 } 2726 2727 /** 2728 * Safely convert the string to lowercase. 2729 * @return lower case representation of the String; or null if 2730 * the input string is null. 2731 */ toLowerCase(String src)2732 public static String toLowerCase(String src) { 2733 if (src == null) { 2734 return null; 2735 } else { 2736 return src.toLowerCase(); 2737 } 2738 } 2739 2740 private static final Pattern dbSpecPattern = 2741 Pattern.compile("(.*)\\{(\\d+),(\\d+)\\}(.*)"); 2742 2743 /** 2744 * @param dbSpecComponent a single component of a DBDescriptor spec 2745 * (e.g. the host or database component). The expected format of the string is: 2746 * <br> 2747 * <center>(prefix){(digits),(digits)}(suffix)</center> 2748 * </br> 2749 * @return a shard expansion of the given String. 2750 * Note that unless the pattern is matched exactly, no expansion is 2751 * performed and the original string is returned unaltered. 2752 * For example, 'db{0,1}.adz' is expanded into 'db0.adz, db1.adz'. 
2753 * Note that this method is added to StringUtil instead of 2754 * DBDescriptor to better encapsulate the choice of regexp implementation. 2755 * @throws IllegalArgumentException if the string does not parse. 2756 */ expandShardNames(String dbSpecComponent)2757 public static String expandShardNames(String dbSpecComponent) 2758 throws IllegalArgumentException, IllegalStateException { 2759 2760 Matcher matcher = dbSpecPattern.matcher(dbSpecComponent); 2761 if (matcher.find()) { 2762 try { 2763 String prefix = dbSpecComponent.substring( 2764 matcher.start(1), matcher.end(1)); 2765 int minShard = 2766 Integer.parseInt( 2767 dbSpecComponent.substring( 2768 matcher.start(2), matcher.end(2))); 2769 int maxShard = 2770 Integer.parseInt( 2771 dbSpecComponent.substring( 2772 matcher.start(3), matcher.end(3))); 2773 String suffix = dbSpecComponent.substring( 2774 matcher.start(4), matcher.end(4)); 2775 //Log2.logEvent(prefix + " " + minShard + " " + maxShard + " " + suffix); 2776 if (minShard > maxShard) { 2777 throw new IllegalArgumentException( 2778 "Maximum shard must be greater than or equal to " + 2779 "the minimum shard"); 2780 } 2781 StringBuilder tmp = new StringBuilder(); 2782 for (int shard = minShard; shard <= maxShard; shard++) { 2783 tmp.append(prefix).append(shard).append(suffix); 2784 if (shard != maxShard) { 2785 tmp.append(","); 2786 } 2787 } 2788 return tmp.toString(); 2789 } catch (NumberFormatException nfex) { 2790 throw new IllegalArgumentException( 2791 "Malformed DB specification component: " + dbSpecComponent); 2792 } 2793 } else { 2794 return dbSpecComponent; 2795 } 2796 } 2797 2798 2799 /** 2800 * Returns a string that is equivalent to the specified string with its 2801 * first character converted to uppercase as by {@link String#toUpperCase()}. 
2802 * The returned string will have the same value as the specified string if 2803 * its first character is non-alphabetic, if its first character is already 2804 * uppercase, or if the specified string is of length 0. 2805 * 2806 * <p>For example: 2807 * <pre> 2808 * capitalize("foo bar").equals("Foo bar"); 2809 * capitalize("2b or not 2b").equals("2b or not 2b") 2810 * capitalize("Foo bar").equals("Foo bar"); 2811 * capitalize("").equals(""); 2812 * </pre> 2813 * 2814 * @param s the string whose first character is to be uppercased 2815 * @return a string equivalent to <tt>s</tt> with its first character 2816 * converted to uppercase 2817 * @throws NullPointerException if <tt>s</tt> is null 2818 */ capitalize(String s)2819 public static String capitalize(String s) { 2820 if (s.length() == 0) { 2821 return s; 2822 } 2823 char first = s.charAt(0); 2824 char capitalized = Character.toUpperCase(first); 2825 return (first == capitalized) 2826 ? s 2827 : capitalized + s.substring(1); 2828 } 2829 2830 /** 2831 * Examine a string to see if it starts with a given prefix (case 2832 * insensitive). Just like String.startsWith() except doesn't 2833 * respect case. Strings are compared in the same way as in 2834 * {@link String#equalsIgnoreCase}. 2835 * 2836 * @param str the string to examine 2837 * @param prefix the prefix to look for 2838 * @return a boolean indicating if str starts with prefix (case insensitive) 2839 */ startsWithIgnoreCase(String str, String prefix)2840 public static boolean startsWithIgnoreCase(String str, String prefix) { 2841 return str.regionMatches(true, 0, prefix, 0, prefix.length()); 2842 } 2843 2844 /** 2845 * Examine a string to see if it ends with a given suffix (case 2846 * insensitive). Just like String.endsWith() except doesn't respect 2847 * case. Strings are compared in the same way as in 2848 * {@link String#equalsIgnoreCase}. 
2849 * 2850 * @param str the string to examine 2851 * @param suffix the suffix to look for 2852 * @return a boolean indicating if str ends with suffix (case insensitive) 2853 */ endsWithIgnoreCase(String str, String suffix)2854 public static boolean endsWithIgnoreCase(String str, String suffix) { 2855 int len = suffix.length(); 2856 return str.regionMatches(true, str.length() - len, suffix, 0, len); 2857 } 2858 2859 /** 2860 * @param c one codePoint 2861 * @return the number of bytes needed to encode this codePoint in UTF-8 2862 */ bytesUtf8(int c)2863 private static int bytesUtf8(int c) { 2864 if (c < 0x80) { 2865 return 1; 2866 } else if (c < 0x00800) { 2867 return 2; 2868 } else if (c < 0x10000) { 2869 return 3; 2870 } else if (c < 0x200000) { 2871 return 4; 2872 2873 // RFC 3629 forbids the use of UTF-8 for codePoint greater than 0x10FFFF, 2874 // so if the caller respects this RFC, this should not happen 2875 } else if (c < 0x4000000) { 2876 return 5; 2877 } else { 2878 return 6; 2879 } 2880 } 2881 2882 /** 2883 * @param str a string 2884 * @return the number of bytes required to represent this string in UTF-8 2885 */ bytesStorage(String str)2886 public static int bytesStorage(String str) { 2887 // offsetByCodePoint has a bug if its argument is the result of a 2888 // call to substring. 
To avoid this, we create a new String 2889 // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664 2890 String s = new String(str); 2891 2892 int len = 0; 2893 for (int i = 0; i < s.length(); i = s.offsetByCodePoints(i, 1)) { 2894 len += bytesUtf8(s.codePointAt(i)); 2895 } 2896 return len; 2897 } 2898 2899 /** 2900 * @param str a string 2901 * @param maxbytes 2902 * @return the beginning of the string, so that it uses less than 2903 * maxbytes bytes in UTF-8 2904 * @throws IndexOutOfBoundsException if maxbytes is negative 2905 */ truncateStringForUtf8Storage(String str, int maxbytes)2906 public static String truncateStringForUtf8Storage(String str, int maxbytes) { 2907 if (maxbytes < 0) { 2908 throw new IndexOutOfBoundsException(); 2909 } 2910 2911 // offsetByCodePoint has a bug if its argument is the result of a 2912 // call to substring. To avoid this, we create a new String 2913 // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664 2914 // TODO(cquinn): should be fixed as of 1.5.0_01 2915 String s = new String(str); 2916 2917 int codepoints = 0; 2918 int bytesUsed = 0; 2919 for (codepoints = 0; codepoints < s.length(); 2920 codepoints = s.offsetByCodePoints(codepoints, 1)) { 2921 int glyphBytes = StringUtil.bytesUtf8(s.codePointAt(codepoints)); 2922 if (bytesUsed + glyphBytes > maxbytes) { 2923 break; 2924 } 2925 bytesUsed += glyphBytes; 2926 } 2927 return s.substring(0, codepoints); 2928 } 2929 2930 /** 2931 * If the given string is of length {@code maxLength} or less, then it is 2932 * returned as is. 2933 * If the string is longer than {@code maxLength}, the returned string is 2934 * truncated before the last space character on or before 2935 * {@code source.charAt(maxLength)}. If the string has no spaces, the 2936 * returned string is truncated to {@code maxLength}. 
2937 * 2938 * @param source the string to truncate if necessary 2939 * @param maxLength 2940 * @return the original string if its length is less than or equal to 2941 * maxLength, otherwise a truncated string as mentioned above 2942 */ truncateIfNecessary(String source, int maxLength)2943 public static String truncateIfNecessary(String source, int maxLength) { 2944 if (source.length() <= maxLength) { 2945 return source; 2946 } 2947 String str = unicodePreservingSubstring(source, 0, maxLength); 2948 2949 @SuppressWarnings("deprecation") // we'll make this go away before that does 2950 CharMatcher whitespaceMatcher = CharMatcher.LEGACY_WHITESPACE; 2951 String truncated = whitespaceMatcher.trimTrailingFrom(str); 2952 2953 // We may have had multiple spaces at maxLength, which were stripped away 2954 if (truncated.length() < maxLength) { 2955 return truncated; 2956 } 2957 // We have a truncated string of length maxLength. If the next char was a 2958 // space, we truncated at a word boundary, so we can return immediately 2959 if (Character.isSpaceChar(source.charAt(maxLength))) { 2960 return truncated; 2961 } 2962 // We truncated in the middle of the word. Try to truncate before 2963 // the last space, if it exists. Otherwise, return the truncated string 2964 for (int i = truncated.length() - 1; i >= 0; --i) { 2965 if (Character.isSpaceChar(truncated.charAt(i))) { 2966 String substr = truncated.substring(0, i); 2967 return whitespaceMatcher.trimTrailingFrom(substr); 2968 } 2969 } 2970 return truncated; 2971 } 2972 2973 /** 2974 * If this given string is of length {@code maxLength} or less, it will 2975 * be returned as-is. 2976 * Otherwise it will be trucated to {@code maxLength}, regardless of whether 2977 * there are any space characters in the String. If an ellipsis is requested 2978 * to be appended to the truncated String, the String will be truncated so 2979 * that the ellipsis will also fit within maxLength. 
2980 * If no truncation was necessary, no ellipsis will be added. 2981 * 2982 * @param source the String to truncate if necessary 2983 * @param maxLength the maximum number of characters to keep 2984 * @param addEllipsis if true, and if the String had to be truncated, 2985 * add "..." to the end of the String before returning. Additionally, 2986 * the ellipsis will only be added if maxLength is greater than 3. 2987 * @return the original string if its length is less than or equal to 2988 * maxLength, otherwise a truncated string as mentioned above 2989 */ truncateAtMaxLength(String source, int maxLength, boolean addEllipsis)2990 public static String truncateAtMaxLength(String source, int maxLength, 2991 boolean addEllipsis) { 2992 2993 if (source.length() <= maxLength) { 2994 return source; 2995 } 2996 if (addEllipsis && maxLength > 3) { 2997 return unicodePreservingSubstring(source, 0, maxLength - 3) + "..."; 2998 } 2999 return unicodePreservingSubstring(source, 0, maxLength); 3000 } 3001 3002 /** 3003 * Normalizes {@code index} such that it respects Unicode character 3004 * boundaries in {@code str}. 3005 * 3006 * <p>If {@code index} is the low surrogate of a unicode character, 3007 * the method returns {@code index - 1}. Otherwise, {@code index} is 3008 * returned. 3009 * 3010 * <p>In the case in which {@code index} falls in an invalid surrogate pair 3011 * (e.g. consecutive low surrogates, consecutive high surrogates), or if 3012 * if it is not a valid index into {@code str}, the original value of 3013 * {@code index} is returned. 
3014 * 3015 * @param str the String 3016 * @param index the index to be normalized 3017 * @return a normalized index that does not split a Unicode character 3018 */ unicodePreservingIndex(String str, int index)3019 public static int unicodePreservingIndex(String str, int index) { 3020 if (index > 0 && index < str.length()) { 3021 if (Character.isHighSurrogate(str.charAt(index - 1)) && 3022 Character.isLowSurrogate(str.charAt(index))) { 3023 return index - 1; 3024 } 3025 } 3026 return index; 3027 } 3028 3029 /** 3030 * Returns a substring of {@code str} that respects Unicode character 3031 * boundaries. 3032 * 3033 * <p>The string will never be split between a [high, low] surrogate pair, 3034 * as defined by {@link Character#isHighSurrogate} and 3035 * {@link Character#isLowSurrogate}. 3036 * 3037 * <p>If {@code begin} or {@code end} are the low surrogate of a unicode 3038 * character, it will be offset by -1. 3039 * 3040 * <p>This behavior guarantees that 3041 * {@code str.equals(StringUtil.unicodePreservingSubstring(str, 0, n) + 3042 * StringUtil.unicodePreservingSubstring(str, n, str.length())) } is 3043 * true for all {@code n}. 3044 * </pre> 3045 * 3046 * <p>This means that unlike {@link String#substring(int, int)}, the length of 3047 * the returned substring may not necessarily be equivalent to 3048 * {@code end - begin}. 
3049 * 3050 * @param str the original String 3051 * @param begin the beginning index, inclusive 3052 * @param end the ending index, exclusive 3053 * @return the specified substring, possibly adjusted in order to not 3054 * split unicode surrogate pairs 3055 * @throws IndexOutOfBoundsException if the {@code begin} is negative, 3056 * or {@code end} is larger than the length of {@code str}, or 3057 * {@code begin} is larger than {@code end} 3058 */ unicodePreservingSubstring( String str, int begin, int end)3059 public static String unicodePreservingSubstring( 3060 String str, int begin, int end) { 3061 return str.substring(unicodePreservingIndex(str, begin), 3062 unicodePreservingIndex(str, end)); 3063 } 3064 3065 /** 3066 * Equivalent to: 3067 * 3068 * <pre> 3069 * {@link #unicodePreservingSubstring(String, int, int)}( 3070 * str, begin, str.length()) 3071 * </pre> 3072 */ unicodePreservingSubstring(String str, int begin)3073 public static String unicodePreservingSubstring(String str, int begin) { 3074 return unicodePreservingSubstring(str, begin, str.length()); 3075 } 3076 3077 /** 3078 * True iff the given character needs to be escaped in a javascript string 3079 * literal. 3080 * <p> 3081 * We need to escape the following characters in javascript string literals. 3082 * <dl> 3083 * <dt> \ <dd> the escape character 3084 * <dt> ', " <dd> string delimiters. 3085 * TODO(msamuel): what about backticks (`) which are 3086 * non-standard but recognized as attribute delimiters. 3087 * <dt> &, <, >, = <dd> so that a string literal can be embedded in XHTML 3088 * without further escaping. 3089 * </dl> 3090 * TODO(msamuel): If we're being paranoid, should we escape + to avoid UTF-7 3091 * attacks? 3092 * <p> 3093 * Unicode format control characters (category Cf) must be escaped since they 3094 * are removed by javascript parser in a pre-lex pass. 
3095 * <br>According to EcmaScript 262 Section 7.1: 3096 * <blockquote> 3097 * The format control characters can occur anywhere in the source text of 3098 * an ECMAScript program. These characters are removed from the source 3099 * text before applying the lexical grammar. 3100 * </blockquote> 3101 * <p> 3102 * Additionally, line terminators are not allowed to appear inside strings 3103 * and Section 7.3 says 3104 * <blockquote> 3105 * The following characters are considered to be line terminators:<pre> 3106 * Code Point Value Name Formal Name 3107 * \u000A Line Feed [LF] 3108 * \u000D Carriage Return [CR] 3109 * \u2028 Line separator [LS] 3110 * \u2029 Paragraph separator [PS] 3111 * </pre></blockquote> 3112 * 3113 * @param codepoint a char instead of an int since the javascript language 3114 * does not support extended unicode. 3115 */ mustEscapeCharInJsString(int codepoint)3116 static boolean mustEscapeCharInJsString(int codepoint) { 3117 return JS_ESCAPE_CHARS.contains(codepoint); 3118 } 3119 3120 /** 3121 * True iff the given character needs to be escaped in a JSON string literal. 3122 * <p> 3123 * We need to escape the following characters in JSON string literals. 3124 * <dl> 3125 * <dt> \ <dd> the escape character 3126 * <dt> " <dd> string delimiter 3127 * <dt> 0x00 - 0x1F <dd> control characters 3128 * </dl> 3129 * <p> 3130 * See EcmaScript 262 Section 15.12.1 for the full JSON grammar. 3131 */ mustEscapeCharInJsonString(int codepoint)3132 static boolean mustEscapeCharInJsonString(int codepoint) { 3133 return JSON_ESCAPE_CHARS.contains(codepoint); 3134 } 3135 3136 /** 3137 * Builds a small set of code points. 3138 * {@code com.google.common.base} cannot depend on ICU4J, thus avoiding ICU's 3139 * {@code UnicodeSet}. 3140 * For all other purposes, please use {@code com.ibm.icu.text.UnicodeSet}. 
3141 */ 3142 private static class UnicodeSetBuilder { 3143 Set<Integer> codePointSet = new HashSet<Integer>(); 3144 addCodePoint(int c)3145 UnicodeSetBuilder addCodePoint(int c) { 3146 codePointSet.add(c); 3147 return this; 3148 } 3149 addRange(int from, int to)3150 UnicodeSetBuilder addRange(int from, int to) { 3151 for (int i = from; i <= to; i++) { 3152 codePointSet.add(i); 3153 } 3154 return this; 3155 } 3156 create()3157 Set<Integer> create() { 3158 return codePointSet; 3159 } 3160 } 3161 3162 private static final Set<Integer> JS_ESCAPE_CHARS = new UnicodeSetBuilder() 3163 // All characters in the class of format characters, [:Cf:]. 3164 // Source: http://unicode.org/cldr/utility/list-unicodeset.jsp. 3165 .addCodePoint(0xAD) 3166 .addRange(0x600, 0x603) 3167 .addCodePoint(0x6DD) 3168 .addCodePoint(0x070F) 3169 .addRange(0x17B4, 0x17B5) 3170 .addRange(0x200B, 0x200F) 3171 .addRange(0x202A, 0x202E) 3172 .addRange(0x2060, 0x2064) 3173 .addRange(0x206A, 0x206F) 3174 .addCodePoint(0xFEFF) 3175 .addRange(0xFFF9, 0xFFFB) 3176 .addRange(0x0001D173, 0x0001D17A) 3177 .addCodePoint(0x000E0001) 3178 .addRange(0x000E0020, 0x000E007F) 3179 // Plus characters mentioned in the docs of mustEscapeCharInJsString(). 
3180 .addCodePoint(0x0000) 3181 .addCodePoint(0x000A) 3182 .addCodePoint(0x000D) 3183 .addRange(0x2028, 0x2029) 3184 .addCodePoint(0x0085) 3185 .addCodePoint(Character.codePointAt("'", 0)) 3186 .addCodePoint(Character.codePointAt("\"", 0)) 3187 .addCodePoint(Character.codePointAt("&", 0)) 3188 .addCodePoint(Character.codePointAt("<", 0)) 3189 .addCodePoint(Character.codePointAt(">", 0)) 3190 .addCodePoint(Character.codePointAt("=", 0)) 3191 .addCodePoint(Character.codePointAt("\\", 0)) 3192 .create(); 3193 3194 private static final Set<Integer> JSON_ESCAPE_CHARS = new UnicodeSetBuilder() 3195 .addCodePoint(Character.codePointAt("\"", 0)) 3196 .addCodePoint(Character.codePointAt("\\", 0)) 3197 .addRange(0x0000, 0x001F) 3198 .create(); 3199 3200 /** 3201 * <b>To be deprecated:</b> use {@link CharEscapers#xmlEscaper()} instead. 3202 */ xmlEscape(String s)3203 public static String xmlEscape(String s) { 3204 return CharEscapers.xmlEscaper().escape(s); 3205 } 3206 3207 /** 3208 * <b>To be deprecated:</b> use {@link CharEscapers#asciiHtmlEscaper()} instead. 3209 */ htmlEscape(String s)3210 public static String htmlEscape(String s) { 3211 return CharEscapers.asciiHtmlEscaper().escape(s); 3212 } 3213 }