• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2000, Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.android.mail.common.base;
18 
19 import static com.google.android.mail.common.base.Preconditions.checkArgument;
20 
21 import com.google.common.base.Joiner;
22 import com.google.common.base.Joiner.MapJoiner;
23 
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.StringWriter;
27 import java.util.ArrayList;
28 import java.util.Collection;
29 import java.util.Collections;
30 import java.util.HashMap;
31 import java.util.HashSet;
32 import java.util.Iterator;
33 import java.util.LinkedHashMap;
34 import java.util.LinkedList;
35 import java.util.List;
36 import java.util.Map;
37 import java.util.Set;
38 import java.util.StringTokenizer;
39 import java.util.regex.Matcher;
40 import java.util.regex.Pattern;
41 
42 /**
43  * Static utility methods and constants pertaining to {@code String} or {@code
44  * CharSequence} instances.
45  */
46 public final class StringUtil {
StringUtil()47   private StringUtil() {} // COV_NF_LINE
48 
49   /**
50    * A completely arbitrary selection of eight whitespace characters. See
51    * <a href="http://go/white+space">this spreadsheet</a> for more details
52    * about whitespace characters.
53    *
54    * @deprecated Rewrite your code to use {@link CharMatcher#WHITESPACE}, or
55    *     consider the precise set of characters you want to match and construct
56    *     the right explicit {@link CharMatcher} or {@link String} for your own
57    *     purposes.
58    */
59   @Deprecated
60   public static final String WHITE_SPACES = " \r\n\t\u3000\u00A0\u2007\u202F";
61 
62   /** A string containing the carriage return and linefeed characters. */
63   public static final String LINE_BREAKS = "\r\n";
64 
65   /**
66    * Old location of {@link Strings#isNullOrEmpty}; this method will be
67    * deprecated soon.
68    */
isEmpty(String string)69   public static boolean isEmpty(String string) {
70     return Strings.isNullOrEmpty(string);
71   }
72 
73   /**
74    * Returns {@code true} if the given string is null, empty, or comprises only
75    * whitespace characters, as defined by {@link CharMatcher#WHITESPACE}.
76    *
77    * <p><b>Warning:</b> there are many competing definitions of "whitespace";
78    * please see <a href="http://go/white+space">this spreadsheet</a> for
79    * details.
80    *
81    * @param string the string reference to check
82    * @return {@code true} if {@code string} is null, empty, or consists of
83    *     whitespace characters only
84    */
isEmptyOrWhitespace(String string)85   public static boolean isEmptyOrWhitespace(String string) {
86     return string == null || CharMatcher.WHITESPACE.matchesAllOf(string);
87   }
88 
89   /**
90    * Old location of {@link Strings#nullToEmpty}; this method will be
91    * deprecated soon.
92    */
makeSafe(String string)93   public static String makeSafe(String string) {
94     return Strings.nullToEmpty(string);
95   }
96 
97   /**
98    * Old location of {@link Strings#emptyToNull}; this method will be
99    * deprecated soon.
100    */
toNullIfEmpty(String string)101   public static String toNullIfEmpty(String string) {
102     return Strings.emptyToNull(string);
103   }
104 
105   /**
106    * Returns the given string if it is nonempty and contains at least one
107    * non-whitespace character; {@code null} otherwise. See comment in {@link
108    * #isEmptyOrWhitespace} on the definition of whitespace.
109    *
110    * @param string the string to test and possibly return
111    * @return {@code null} if {@code string} is null, empty, or contains only
112    *     whitespace characters; {@code string} itself otherwise
113    */
toNullIfEmptyOrWhitespace( String string)114   public static String toNullIfEmptyOrWhitespace(
115       String string) {
116     return isEmptyOrWhitespace(string) ? null : string;
117   }
118 
119   /**
120    * Old location of {@link Strings#repeat}; this method will be deprecated
121    * soon.
122    */
repeat(String string, int count)123   public static String repeat(String string, int count) {
124     return Strings.repeat(string, count);
125   }
126 
127   /**
128    * Return the first index in the string of any of the specified characters,
129    * starting at a given index, or {@code -1} if none of the characters is
130    * present.
131    *
132    * @param string the non-null character sequence to look in
133    * @param chars a non-null character sequence containing the set of characters
134    *     to look for. If empty, this method will find no matches and return
135    *     {@code -1}
136    * @param fromIndex the index of the first character to examine in the input
137    *     string. If negative, the entire string will be searched. If greater
138    *     than or equal to the string length, no characters will be searched and
139    *     {@code -1} will be returned.
140    * @return the index of the first match, or {@code -1} if no match was found.
141    *     Guaranteed to be either {@code -1} or a number greater than or equal to
142    *     {@code fromIndex}
143    * @throws NullPointerException if any argument is null
144    */
145   // author: pault
indexOfChars( CharSequence string, CharSequence chars, int fromIndex)146   public static int indexOfChars(
147       CharSequence string, CharSequence chars, int fromIndex) {
148     if (fromIndex >= string.length()) {
149       return -1;
150     }
151 
152     /*
153      * Prepare lookup structures for the characters. TODO(pault): This loop
154      * could be factored into another method to allow caching of the resulting
155      * struct if a use-case of very large character sets exists.
156      */
157     Set<Character> charSet = Collections.emptySet();
158     boolean[] charArray = new boolean[128];
159     for (int i = 0; i < chars.length(); i++) {
160       char c = chars.charAt(i);
161       if (c < 128) {
162         charArray[c] = true;
163       } else {
164         if (charSet.isEmpty()) {
165           charSet = new HashSet<Character>();
166         }
167         charSet.add(c);
168       }
169     }
170 
171     // Scan the string for matches
172     for (int i = Math.max(fromIndex, 0); i < string.length(); i++) {
173       char c = string.charAt(i);
174       if (c < 128) {
175         if (charArray[c]) {
176           return i;
177         }
178       } else if (charSet.contains(c)) {
179         return i;
180       }
181     }
182     return -1;
183   }
184 
185 /*
186  * -------------------------------------------------------------------
187  * This marks the end of the code that has been written or rewritten
188  * in 2008 to the quality standards of the Java core libraries group.
189  * Code below this point is still awaiting cleanup (you can help!).
190  * See http://wiki/Nonconf/JavaCoreLibrariesStandards.
191  * -------------------------------------------------------------------
192  */
193 
194 
195   /**
196    * @param str the string to split.  Must not be null.
197    * @param delims the delimiter characters. Each character in the
198    *        string is individually treated as a delimiter.
199    * @return an array of tokens. Will not return null. Individual tokens
200    *        do not have leading/trailing whitespace removed.
201    * @deprecated see the detailed instructions under
202    *     {@link #split(String, String, boolean)}
203    */
204   @Deprecated
split(String str, String delims)205   public static String[] split(String str, String delims) {
206     return split(str, delims, false);
207   }
208 
209   /**
210    * This method is deprecated because it is too inflexible, providing
211    * only a very specific set of behaviors that almost never matches exactly
212    * what you intend. Prefer using a {@link Splitter}, which is more flexible
213    * and consistent in the way it handles trimming and empty tokens.
214    *
215    * <ul>
216    * <li>Create a {@link Splitter} using {@link Splitter#on(CharMatcher)} such
217    *     as {@code Splitter.on(CharMatcher.anyOf(delims))}.
218    * <li><i>If</i> you need whitespace trimmed from the ends of each segment,
219    *     adding {@code .trimResults()} to your splitter definition should work
220    *     in most cases. To match the exact behavior of this method, use
221    *     {@code .trimResults(CharMatcher.inRange('\0', ' '))}.
222    * <li>This method silently ignores empty tokens in the input, but allows
223    *     empty tokens to appear in the output if {@code trimTokens} is
224    *     {@code true}. Adding {@code .omitEmptyStrings()} to your splitter
225    *     definition will filter empty tokens out but will do so <i>after</i>
226    *     having performed trimming. If you absolutely require this method's
227    *     behavior in this respect, Splitter is not able to match it.
228    * <li>If you need the result as an array, use {@link
229    *     com.google.common.collect.Iterables#toArray(Iterable, Class)} on the
230    *     {@code Iterable<String>} returned by {@link Splitter#split}.
231    * </ul>
232    *
233    * @param str the string to split.  Must not be null.
234    * @param delims the delimiter characters. Each character in the string
235    *        is individually treated as a delimiter.
236    * @param trimTokens if true, leading/trailing whitespace is removed
237    *        from the tokens
238    * @return an array of tokens. Will not return null.
239    * @deprecated
240    */
241   @Deprecated
split( String str, String delims, boolean trimTokens)242   public static String[] split(
243       String str, String delims, boolean trimTokens) {
244     StringTokenizer tokenizer = new StringTokenizer(str, delims);
245     int n = tokenizer.countTokens();
246     String[] list = new String[n];
247     for (int i = 0; i < n; i++) {
248       if (trimTokens) {
249         list[i] = tokenizer.nextToken().trim();
250       } else {
251         list[i] = tokenizer.nextToken();
252       }
253     }
254     return list;
255   }
256 
257   /**
258    * Trim characters from only the beginning of a string.
259    * This is a convenience method, it simply calls trimStart(s, null).
260    *
261    * @param s String to be trimmed
262    * @return String with whitespace characters removed from the beginning
263    */
trimStart(String s)264   public static String trimStart(String s) {
265     return trimStart(s, null);
266   }
267 
268   /**
269    * Trim characters from only the beginning of a string.
270    * This method will remove all whitespace characters
271    * (defined by Character.isWhitespace(char), in addition to the characters
272    * provided, from the end of the provided string.
273    *
274    * @param s String to be trimmed
275    * @param extraChars Characters in addition to whitespace characters that
276    *                   should be trimmed.  May be null.
277    * @return String with whitespace and characters in extraChars removed
278    *                   from the beginning
279    */
trimStart(String s, String extraChars)280   public static String trimStart(String s, String extraChars) {
281     int trimCount = 0;
282     while (trimCount < s.length()) {
283       char ch = s.charAt(trimCount);
284       if (Character.isWhitespace(ch)
285         || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
286         trimCount++;
287       } else {
288         break;
289       }
290     }
291 
292     if (trimCount == 0) {
293       return s;
294     }
295     return s.substring(trimCount);
296   }
297 
298   /**
299    * Trim characters from only the end of a string.
300    * This is a convenience method, it simply calls trimEnd(s, null).
301    *
302    * @param s String to be trimmed
303    * @return String with whitespace characters removed from the end
304    */
trimEnd(String s)305   public static String trimEnd(String s) {
306     return trimEnd(s, null);
307   }
308 
309   /**
310    * Trim characters from only the end of a string.
311    * This method will remove all whitespace characters
312    * (defined by Character.isWhitespace(char), in addition to the characters
313    * provided, from the end of the provided string.
314    *
315    * @param s String to be trimmed
316    * @param extraChars Characters in addition to whitespace characters that
317    *                   should be trimmed.  May be null.
318    * @return String with whitespace and characters in extraChars removed
319    *                   from the end
320    */
trimEnd(String s, String extraChars)321   public static String trimEnd(String s, String extraChars) {
322     int trimCount = 0;
323     while (trimCount < s.length()) {
324       char ch = s.charAt(s.length() - trimCount - 1);
325       if (Character.isWhitespace(ch)
326         || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
327         trimCount++;
328       } else {
329         break;
330       }
331     }
332 
333     if (trimCount == 0) {
334       return s;
335     }
336     return s.substring(0, s.length() - trimCount);
337   }
338 
339   /**
340    * @param str the string to split.  Must not be null.
341    * @param delims the delimiter characters. Each character in the
342    *        string is individually treated as a delimiter.
343    * @return an array of tokens. Will not return null. Leading/trailing
344    *        whitespace is removed from the tokens.
345    * @deprecated see the detailed instructions under
346    *     {@link #split(String, String, boolean)}
347    */
348   @Deprecated
splitAndTrim(String str, String delims)349   public static String[] splitAndTrim(String str, String delims) {
350     return split(str, delims, true);
351   }
352 
353   /** Parse comma-separated list of ints and return as array. */
splitInts(String str)354   public static int[] splitInts(String str) throws IllegalArgumentException {
355     StringTokenizer tokenizer = new StringTokenizer(str, ",");
356     int n = tokenizer.countTokens();
357     int[] list = new int[n];
358     for (int i = 0; i < n; i++) {
359       String token = tokenizer.nextToken();
360       list[i] = Integer.parseInt(token);
361     }
362     return list;
363   }
364 
365   /** Parse comma-separated list of longs and return as array. */
splitLongs(String str)366   public static long[] splitLongs(String str) throws IllegalArgumentException {
367     StringTokenizer tokenizer = new StringTokenizer(str, ",");
368     int n = tokenizer.countTokens();
369     long[] list = new long[n];
370     for (int i = 0; i < n; i++) {
371       String token = tokenizer.nextToken();
372       list[i] = Long.parseLong(token);
373     }
374     return list;
375   }
376 
377   /** This replaces the occurrences of 'what' in 'str' with 'with'
378    *
379    * @param str the string to process
380    * @param what to replace
381    * @param with replace with this
382    * @return String str where 'what' was replaced with 'with'
383    *
384    * @deprecated Please use {@link String#replace(CharSequence, CharSequence)}.
385    */
386   @Deprecated
replace( String str, CharSequence what, CharSequence with)387   public static String replace(
388       String str, CharSequence what, CharSequence with) {
389     // Have to check this argument, for compatibility with the old impl.
390     // For the record, String.replace() is capable of handling an empty target
391     // string... but it does something kind of weird in that case.
392     checkArgument(what.length() > 0);
393     return str.replace(what, with);
394   }
395 
396   private static final Splitter NEWLINE_SPLITTER =
397       Splitter.on('\n').omitEmptyStrings();
398 
399   /**
400    * Reformats the given string to a fixed width by inserting carriage returns
401    * and trimming unnecessary whitespace. See
402    * {@link #fixedWidth(String[], int)} for details. The {@code str} argument
403    * to this method will be split on newline characters ({@code '\n'}) only
404    * (regardless of platform).  An array of resulting non-empty strings is
405    * then passed to {@link #fixedWidth(String[], int)} as the {@code lines}
406    * parameter.
407    *
408    * @param str the string to format
409    * @param width the fixed width (in characters)
410    */
fixedWidth(String str, int width)411   public static String fixedWidth(String str, int width) {
412     List<String> lines = new ArrayList<String>();
413 
414     for (String line : NEWLINE_SPLITTER.split(str)) {
415       lines.add(line);
416     }
417 
418     String[] lineArray = lines.toArray(new String[0]);
419     return fixedWidth(lineArray, width);
420   }
421 
422   /**
423    * Reformats the given array of lines to a fixed width by inserting
424    * newlines and trimming unnecessary whitespace.  This uses simple
425    * whitespace-based splitting, not sophisticated internationalized
426    * line breaking.  Newlines within a line are treated like any other
427    * whitespace.  Lines which are already short enough will be passed
428    * through unmodified.
429    *
430    * <p>Only breaking whitespace characters (those which match
431    * {@link CharMatcher#BREAKING_WHITESPACE}) are treated as whitespace by
432    * this method. Non-breaking whitespace characters will be considered as
433    * ordinary characters which are connected to any other adjacent
434    * non-whitespace characters, and will therefore appear in the returned
435    * string in their original context.
436    *
437    * @param lines array of lines to format
438    * @param width the fixed width (in characters)
439    */
fixedWidth(String[] lines, int width)440   public static String fixedWidth(String[] lines, int width) {
441     List<String> formattedLines = new ArrayList<String>();
442 
443     for (String line : lines) {
444       formattedLines.add(formatLineToFixedWidth(line, width));
445     }
446 
447     return Joiner.on('\n').join(formattedLines);
448   }
449 
450   private static final Splitter TO_WORDS =
451       Splitter.on(CharMatcher.BREAKING_WHITESPACE).omitEmptyStrings();
452 
453   /**
454    * Helper method for {@link #fixedWidth(String[], int)}
455    */
formatLineToFixedWidth(String line, int width)456   private static String formatLineToFixedWidth(String line, int width) {
457     if (line.length() <= width) {
458       return line;
459     }
460 
461     StringBuilder builder = new StringBuilder();
462     int col = 0;
463 
464     for (String word : TO_WORDS.split(line)) {
465       if (col == 0) {
466         col = word.length();
467       } else {
468         int newCol = col + word.length() + 1;  // +1 for the space
469 
470         if (newCol <= width) {
471           builder.append(' ');
472           col = newCol;
473         } else {
474           builder.append('\n');
475           col = word.length();
476         }
477       }
478 
479       builder.append(word);
480     }
481 
482     return builder.toString();
483   }
484 
485   /**
486    * Splits the argument original into a list of substrings.  All the
487    * substrings in the returned list (except possibly the last) will
488    * have length lineLen.
489    *
490    * @param lineLen  the length of the substrings to put in the list
491    * @param original the original string
492    *
493    * @return a list of strings of length lineLen that together make up the
494    *     original string
495    * @deprecated use {@code Splitter.fixedLength(lineLen).split(original))}
496    *     (note that it returns an {@code Iterable}, not a {@code List})
497    */
498   @Deprecated
fixedSplit(String original, int lineLen)499   public static List<String> fixedSplit(String original, int lineLen) {
500     List<String> output = new ArrayList<String>();
501     for (String elem : Splitter.fixedLength(lineLen).split(original)) {
502       output.add(elem);
503     }
504     return output;
505   }
506 
507   /**
508    * Indents the given String per line.
509    * @param iString the string to indent
510    * @param iIndentDepth the depth of the indentation
511    * @return the indented string
512    */
indent(String iString, int iIndentDepth)513   public static String indent(String iString, int iIndentDepth) {
514     StringBuilder spacer = new StringBuilder();
515     spacer.append("\n");
516     for (int i = 0; i < iIndentDepth; i++) {
517       spacer.append("  ");
518     }
519     return iString.replace("\n", spacer.toString());
520   }
521 
522   /**
523    * This is a both way strip.
524    *
525    * @param str the string to strip
526    * @param left strip from left
527    * @param right strip from right
528    * @param what character(s) to strip
529    * @return the stripped string
530    * @deprecated ensure the string is not null and use
531    *  <ul>
532    *    <li> {@code CharMatcher.anyOf(what).trimFrom(str)}
533    *        if {@code left == true} and {@code right == true}
534    *    <li> {@code CharMatcher.anyOf(what).trimLeadingFrom(str)}
535    *        if {@code left == true} and {@code right == false}
536    *    <li> {@code CharMatcher.anyOf(what).trimTrailingFrom(str)}
537    *        if {@code left == false} and {@code right == true}
538    *  </ul>
539    */
540   @Deprecated
megastrip(String str, boolean left, boolean right, String what)541   public static String megastrip(String str,
542                                  boolean left, boolean right,
543                                  String what) {
544     if (str == null) {
545       return null;
546     }
547 
548     CharMatcher matcher = CharMatcher.anyOf(what);
549     if (left) {
550       if (right) {
551         return matcher.trimFrom(str);
552       }
553       return matcher.trimLeadingFrom(str);
554     }
555     if (right) {
556       return matcher.trimTrailingFrom(str);
557     }
558     return str;
559   }
560 
561   /** strip - strips both ways
562    *
563    * @param str what to strip
564    * @return String the striped string
565    * @deprecated ensure the string is not null and use {@code
566    *     CharMatcher.LEGACY_WHITESPACE.trimFrom(str)}; also consider whether you
567    *     really want the legacy whitespace definition, or something more
568    *     standard like {@link CharMatcher#WHITESPACE}.
569    */
570   @SuppressWarnings("deprecation") // this is deprecated itself
strip(String str)571   @Deprecated public static String strip(String str) {
572     return (str == null) ? null : CharMatcher.LEGACY_WHITESPACE.trimFrom(str);
573   }
574 
575   /** Strip white spaces from both end, and collapse white spaces
576    * in the middle.
577    *
578    * @param str what to strip
579    * @return String the striped and collapsed string
580    * @deprecated ensure the string is not null and use {@code
581    *     CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ')}; also
582    *     consider whether you really want the legacy whitespace definition, or
583    *     something more standard like {@link CharMatcher#WHITESPACE}.
584    */
585   @SuppressWarnings("deprecation") // this is deprecated itself
stripAndCollapse(String str)586   @Deprecated public static String stripAndCollapse(String str) {
587     return (str == null) ? null
588         : CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ');
589   }
590 
591   /**
592    * Give me a string and a potential prefix, and I return the string
593    * following the prefix if the prefix matches, else null.
594    * Analogous to the c++ functions strprefix and var_strprefix.
595    *
596    * @param str the string to strip
597    * @param prefix the expected prefix
598    * @return the stripped string or <code>null</code> if the string
599    * does not start with the prefix
600    */
stripPrefix(String str, String prefix)601   public static String stripPrefix(String str, String prefix) {
602     return str.startsWith(prefix)
603         ? str.substring(prefix.length())
604         : null;
605   }
606 
607   /**
608    * Case insensitive version of stripPrefix. Strings are compared in
609    * the same way as in {@link String#equalsIgnoreCase}.
610    * Analogous to the c++ functions strcaseprefix and var_strcaseprefix.
611    *
612    * @param str the string to strip
613    * @param prefix the expected prefix
614    * @return the stripped string or <code>null</code> if the string
615    * does not start with the prefix
616    */
stripPrefixIgnoreCase(String str, String prefix)617   public static String stripPrefixIgnoreCase(String str, String prefix) {
618     return startsWithIgnoreCase(str, prefix)
619         ? str.substring(prefix.length())
620         : null;
621   }
622 
623   /**
624    * Give me a string and a potential suffix, and I return the string
625    * before the suffix if the suffix matches, else null.
626    * Analogous to the c++ function strsuffix.
627    *
628    * @param str the string to strip
629    * @param suffix the expected suffix
630    * @return the stripped string or <code>null</code> if the string
631    * does not end with the suffix
632    */
stripSuffix(String str, String suffix)633   public static String stripSuffix(String str, String suffix) {
634     return str.endsWith(suffix)
635         ? str.substring(0, str.length() - suffix.length())
636         : null;
637   }
638 
639   /**
640    * Case insensitive version of stripSuffix. Strings are compared in
641    * the same way as in {@link String#equalsIgnoreCase}.
642    * Analogous to the c++ function strcasesuffix.
643    *
644    * @param str the string to strip
645    * @param suffix the expected suffix
646    * @return the stripped string or <code>null</code> if the string
647    * does not end with the suffix
648    */
stripSuffixIgnoreCase( String str, String suffix)649   public static String stripSuffixIgnoreCase(
650       String str, String suffix) {
651     return endsWithIgnoreCase(str, suffix)
652         ? str.substring(0, str.length() - suffix.length())
653         : null;
654   }
655 
656   /**
657    * Strips all non-digit characters from a string.
658    *
659    * The resulting string will only contain characters for which isDigit()
660    * returns true.
661    *
662    * @param str the string to strip
663    * @return a string consisting of digits only, or an empty string
664    * @deprecated use {@code CharMatcher.JAVA_DIGIT.retainFrom(str)} (also
665    *     consider whether this is really the definition of "digit" you wish to
666    *     use)
667    */
stripNonDigits(String str)668   @Deprecated public static String stripNonDigits(String str) {
669     return CharMatcher.JAVA_DIGIT.retainFrom(str);
670   }
671 
672   /**
673    * Finds the last index in str of a character not in the characters
674    * in 'chars' (similar to ANSI string.find_last_not_of).
675    *
676    * Returns -1 if no such character can be found.
677    *
678    * <p><b>Note:</b> If {@code fromIndex} is zero, use {@link CharMatcher}
679    * instead for this: {@code CharMatcher.noneOf(chars).lastIndexIn(str)}.
680    */
681   // TODO(kevinb): after adding fromIndex versions of (last)IndexOf to
682   // CharMatcher, deprecate this
lastIndexNotOf(String str, String chars, int fromIndex)683   public static int lastIndexNotOf(String str, String chars, int fromIndex) {
684     fromIndex = Math.min(fromIndex, str.length() - 1);
685 
686     for (int pos = fromIndex; pos >= 0; pos--) {
687       if (chars.indexOf(str.charAt(pos)) < 0) {
688         return pos;
689       }
690     }
691 
692     return -1;
693   }
694 
695   /**
696    * Like String.replace() except that it accepts any number of old chars.
697    * Replaces any occurrances of 'oldchars' in 'str' with 'newchar'.
698    * Example: replaceChars("Hello, world!", "H,!", ' ') returns " ello  world "
699    *
700    * @deprecated use {@code CharMatcher#replaceFrom(String, char)}, for example
701    *     {@code CharMatcher.anyOf(oldchars).replaceFrom(str, newchar)}
702    */
replaceChars( String str, CharSequence oldchars, char newchar)703   @Deprecated public static String replaceChars(
704       String str, CharSequence oldchars, char newchar) {
705     return CharMatcher.anyOf(oldchars).replaceFrom(str, newchar);
706   }
707 
708   /**
709    * Remove any occurrances of 'oldchars' in 'str'.
710    * Example: removeChars("Hello, world!", ",!") returns "Hello world"
711    *
712    * @deprecated use {@link CharMatcher#removeFrom(CharSequence)}, for example
713    *     {@code CharMatcher.anyOf(oldchars).removeFrom(str)}
714    */
removeChars( String str, CharSequence oldchars)715   @Deprecated public static String removeChars(
716       String str, CharSequence oldchars) {
717     return CharMatcher.anyOf(oldchars).removeFrom(str);
718   }
719 
720   // See http://www.microsoft.com/typography/unicode/1252.htm
721   private static final CharMatcher FANCY_SINGLE_QUOTE
722       = CharMatcher.anyOf("\u0091\u0092\u2018\u2019");
723   private static final CharMatcher FANCY_DOUBLE_QUOTE
724       = CharMatcher.anyOf("\u0093\u0094\u201c\u201d");
725 
726   /**
727    * Replaces microsoft "smart quotes" (curly " and ') with their
728    * ascii counterparts.
729    */
replaceSmartQuotes(String str)730   public static String replaceSmartQuotes(String str) {
731     String tmp = FANCY_SINGLE_QUOTE.replaceFrom(str, '\'');
732     return FANCY_DOUBLE_QUOTE.replaceFrom(tmp, '"');
733   }
734 
735   /**
736    * Convert a string of hex digits to a byte array, with the first
737    * byte in the array being the MSB. The string passed in should be
738    * just the raw digits (upper or lower case), with no leading
739    * or trailing characters (like '0x' or 'h').
740    * An odd number of characters is supported.
741    * If the string is empty, an empty array will be returned.
742    *
743    * This is significantly faster than using
744    *   new BigInteger(str, 16).toByteArray();
745    * especially with larger strings. Here are the results of some
746    * microbenchmarks done on a P4 2.8GHz 2GB RAM running
747    * linux 2.4.22-gg11 and JDK 1.5 with an optimized build:
748    *
749    * String length        hexToBytes (usec)   BigInteger
750    * -----------------------------------------------------
751    * 16                       0.570                 1.43
752    * 256                      8.21                 44.4
753    * 1024                    32.8                 526
754    * 16384                  546                121000
755    */
hexToBytes(CharSequence str)756   public static byte[] hexToBytes(CharSequence str) {
757     byte[] bytes = new byte[(str.length() + 1) / 2];
758     if (str.length() == 0) {
759       return bytes;
760     }
761     bytes[0] = 0;
762     int nibbleIdx = (str.length() % 2);
763     for (int i = 0; i < str.length(); i++) {
764       char c = str.charAt(i);
765       if (!isHex(c)) {
766         throw new IllegalArgumentException("string contains non-hex chars");
767       }
768       if ((nibbleIdx % 2) == 0) {
769         bytes[nibbleIdx >> 1] = (byte) (hexValue(c) << 4);
770       } else {
771         bytes[nibbleIdx >> 1] += (byte) hexValue(c);
772       }
773       nibbleIdx++;
774     }
775     return bytes;
776   }
777 
778   /**
779    * Converts any instances of "\r" or "\r\n" style EOLs into "\n" (Line Feed).
780    */
convertEOLToLF(String input)781   public static String convertEOLToLF(String input) {
782     StringBuilder res = new StringBuilder(input.length());
783     char[] s = input.toCharArray();
784     int from = 0;
785     final int end = s.length;
786     for (int i = 0; i < end; i++) {
787       if (s[i] == '\r') {
788         res.append(s, from, i - from);
789         res.append('\n');
790         if (i + 1 < end && s[i + 1] == '\n') {
791           i++;
792         }
793 
794         from = i + 1;
795       }
796     }
797 
798     if (from == 0) {   // no \r!
799       return input;
800     }
801 
802     res.append(s, from, end - from);
803     return res.toString();
804   }
805 
806   /**
807    * Old location of {@link Strings#padStart}; this method will be deprecated
808    * soon.
809    */
padLeft(String s, int len, char padChar)810   public static String padLeft(String s, int len, char padChar) {
811     return Strings.padStart(s, len, padChar);
812   }
813 
814   /**
815    * Old location of {@link Strings#padEnd}; this method will be deprecated
816    * soon.
817    */
padRight(String s, int len, char padChar)818   public static String padRight(String s, int len, char padChar) {
819     return Strings.padEnd(s, len, padChar);
820   }
821 
822   /**
823    * Returns a string consisting of "s", with each of the first "len" characters
824    * replaced by "maskChar" character.
825    */
maskLeft(String s, int len, char maskChar)826   public static String maskLeft(String s, int len, char maskChar) {
827     if (len <= 0) {
828       return s;
829     }
830     len = Math.min(len, s.length());
831     StringBuilder sb = new StringBuilder();
832     for (int i = 0; i < len; i++) {
833       sb.append(maskChar);
834     }
835     sb.append(s.substring(len));
836     return sb.toString();
837   }
838 
isOctal(char c)839   private static boolean isOctal(char c) {
840     return (c >= '0') && (c <= '7');
841   }
842 
isHex(char c)843   private static boolean isHex(char c) {
844     return ((c >= '0') && (c <= '9')) ||
845            ((c >= 'a') && (c <= 'f')) ||
846            ((c >= 'A') && (c <= 'F'));
847   }
848 
hexValue(char c)849   private static int hexValue(char c) {
850     if ((c >= '0') && (c <= '9')) {
851       return (c - '0');
852     } else if ((c >= 'a') && (c <= 'f')) {
853       return (c - 'a') + 10;
854     } else {
855       return (c - 'A') + 10;
856     }
857   }
858 
859   /**
860    * Unescape any C escape sequences (\n, \r, \\, \ooo, etc) and return the
861    * resulting string.
862    */
unescapeCString(String s)863   public static String unescapeCString(String s) {
864     if (s.indexOf('\\') < 0) {
865       // Fast path: nothing to unescape
866       return s;
867     }
868 
869     StringBuilder sb = new StringBuilder();
870     int len = s.length();
871     for (int i = 0; i < len;) {
872       char c = s.charAt(i++);
873       if (c == '\\' && (i < len)) {
874         c = s.charAt(i++);
875         switch (c) {
876           case 'a':  c = '\007';  break;
877           case 'b':  c = '\b';    break;
878           case 'f':  c = '\f';    break;
879           case 'n':  c = '\n';    break;
880           case 'r':  c = '\r';    break;
881           case 't':  c = '\t';    break;
882           case 'v':  c = '\013';  break;
883           case '\\': c = '\\';    break;
884           case '?':  c = '?';     break;
885           case '\'': c = '\'';    break;
886           case '"':  c = '\"';    break;
887 
888           default: {
889             if ((c == 'x') && (i < len) && isHex(s.charAt(i))) {
890               // "\xXX"
891               int v = hexValue(s.charAt(i++));
892               if ((i < len) && isHex(s.charAt(i))) {
893                 v = v * 16 + hexValue(s.charAt(i++));
894               }
895               c = (char) v;
896             } else if (isOctal(c)) {
897               // "\OOO"
898               int v = (c - '0');
899               if ((i < len) && isOctal(s.charAt(i))) {
900                 v = v * 8 + (s.charAt(i++) - '0');
901               }
902               if ((i < len) && isOctal(s.charAt(i))) {
903                 v = v * 8 + (s.charAt(i++) - '0');
904               }
905               c = (char) v;
906             } else {
907               // Propagate unknown escape sequences.
908               sb.append('\\');
909             }
910             break;
911           }
912         }
913       }
914       sb.append(c);
915     }
916     return sb.toString();
917   }
918 
919   /**
920    * Unescape any MySQL escape sequences.
921    * See MySQL language reference Chapter 6 at
922    * <a href="http://www.mysql.com/doc/">http://www.mysql.com/doc/</a>.
923    * This function will <strong>not</strong> work for other SQL-like
924    * dialects.
925    * @param s string to unescape, with the surrounding quotes.
926    * @return unescaped string, without the surrounding quotes.
927    * @exception IllegalArgumentException if s is not a valid MySQL string.
928    */
unescapeMySQLString(String s)929   public static String unescapeMySQLString(String s)
930       throws IllegalArgumentException {
931     // note: the same buffer is used for both reading and writing
932     // it works because the writer can never outrun the reader
933     char chars[] = s.toCharArray();
934 
935     // the string must be quoted 'like this' or "like this"
936     if (chars.length < 2 || chars[0] != chars[chars.length - 1] ||
937         (chars[0] != '\'' && chars[0] != '"')) {
938       throw new IllegalArgumentException("not a valid MySQL string: " + s);
939     }
940 
941     // parse the string and decode the backslash sequences; in addition,
942     // quotes can be escaped 'like this: ''', "like this: """, or 'like this: "'
943     int j = 1;  // write position in the string (never exceeds read position)
944     int f = 0;  // state: 0 (normal), 1 (backslash), 2 (quote)
945     for (int i = 1; i < chars.length - 1; i++) {
946       if (f == 0) {             // previous character was normal
947         if (chars[i] == '\\') {
948           f = 1;  // backslash
949         } else if (chars[i] == chars[0]) {
950           f = 2;  // quoting character
951         } else {
952           chars[j++] = chars[i];
953         }
954       } else if (f == 1) {      // previous character was a backslash
955         switch (chars[i]) {
956           case '0':   chars[j++] = '\0';   break;
957           case '\'':  chars[j++] = '\'';   break;
958           case '"':   chars[j++] = '"';    break;
959           case 'b':   chars[j++] = '\b';   break;
960           case 'n':   chars[j++] = '\n';   break;
961           case 'r':   chars[j++] = '\r';   break;
962           case 't':   chars[j++] = '\t';   break;
963           case 'z':   chars[j++] = '\032'; break;
964           case '\\':  chars[j++] = '\\';   break;
965           default:
966             // if the character is not special, backslash disappears
967             chars[j++] = chars[i];
968             break;
969         }
970         f = 0;
971       } else {                  // previous character was a quote
972         // quoting characters must be doubled inside a string
973         if (chars[i] != chars[0]) {
974           throw new IllegalArgumentException("not a valid MySQL string: " + s);
975         }
976         chars[j++] = chars[0];
977         f = 0;
978       }
979     }
980     // string contents cannot end with a special character
981     if (f != 0) {
982       throw new IllegalArgumentException("not a valid MySQL string: " + s);
983     }
984 
985     // done
986     return new String(chars, 1, j - 1);
987   }
988 
989   // TODO(pbarry): move all HTML methods to common.html package
990 
  /**
   * Maps HTML 4 character entity names to the character each one denotes.
   * Keys carry the leading '&' but no trailing ';' (for example "&amp"),
   * which is the form unescapeHTML builds when it looks an entity up.
   * Populated once by the static initializer below.
   */
991   static final Map<String, Character> ESCAPE_STRINGS;
  /**
   * The letters, in both cases, that are valid hexadecimal digits
   * (a-f and A-F). Decimal digits are not part of this set.
   */
992   static final Set<Character> HEX_LETTERS;
993 
994   static {
995     // HTML character entity references as defined in HTML 4
996     // see http://www.w3.org/TR/REC-html40/sgml/entities.html
997     ESCAPE_STRINGS = new HashMap<String, Character>(252);
998 
    // Entities for the Latin-1 range, U+00A0 through U+00FF.
999     ESCAPE_STRINGS.put("&nbsp", '\u00A0');
1000     ESCAPE_STRINGS.put("&iexcl", '\u00A1');
1001     ESCAPE_STRINGS.put("&cent", '\u00A2');
1002     ESCAPE_STRINGS.put("&pound", '\u00A3');
1003     ESCAPE_STRINGS.put("&curren", '\u00A4');
1004     ESCAPE_STRINGS.put("&yen", '\u00A5');
1005     ESCAPE_STRINGS.put("&brvbar", '\u00A6');
1006     ESCAPE_STRINGS.put("&sect", '\u00A7');
1007     ESCAPE_STRINGS.put("&uml", '\u00A8');
1008     ESCAPE_STRINGS.put("&copy", '\u00A9');
1009     ESCAPE_STRINGS.put("&ordf", '\u00AA');
1010     ESCAPE_STRINGS.put("&laquo", '\u00AB');
1011     ESCAPE_STRINGS.put("&not", '\u00AC');
1012     ESCAPE_STRINGS.put("&shy", '\u00AD');
1013     ESCAPE_STRINGS.put("&reg", '\u00AE');
1014     ESCAPE_STRINGS.put("&macr", '\u00AF');
1015     ESCAPE_STRINGS.put("&deg", '\u00B0');
1016     ESCAPE_STRINGS.put("&plusmn", '\u00B1');
1017     ESCAPE_STRINGS.put("&sup2", '\u00B2');
1018     ESCAPE_STRINGS.put("&sup3", '\u00B3');
1019     ESCAPE_STRINGS.put("&acute", '\u00B4');
1020     ESCAPE_STRINGS.put("&micro", '\u00B5');
1021     ESCAPE_STRINGS.put("&para", '\u00B6');
1022     ESCAPE_STRINGS.put("&middot", '\u00B7');
1023     ESCAPE_STRINGS.put("&cedil", '\u00B8');
1024     ESCAPE_STRINGS.put("&sup1", '\u00B9');
1025     ESCAPE_STRINGS.put("&ordm", '\u00BA');
1026     ESCAPE_STRINGS.put("&raquo", '\u00BB');
1027     ESCAPE_STRINGS.put("&frac14", '\u00BC');
1028     ESCAPE_STRINGS.put("&frac12", '\u00BD');
1029     ESCAPE_STRINGS.put("&frac34", '\u00BE');
1030     ESCAPE_STRINGS.put("&iquest", '\u00BF');
1031     ESCAPE_STRINGS.put("&Agrave", '\u00C0');
1032     ESCAPE_STRINGS.put("&Aacute", '\u00C1');
1033     ESCAPE_STRINGS.put("&Acirc", '\u00C2');
1034     ESCAPE_STRINGS.put("&Atilde", '\u00C3');
1035     ESCAPE_STRINGS.put("&Auml", '\u00C4');
1036     ESCAPE_STRINGS.put("&Aring", '\u00C5');
1037     ESCAPE_STRINGS.put("&AElig", '\u00C6');
1038     ESCAPE_STRINGS.put("&Ccedil", '\u00C7');
1039     ESCAPE_STRINGS.put("&Egrave", '\u00C8');
1040     ESCAPE_STRINGS.put("&Eacute", '\u00C9');
1041     ESCAPE_STRINGS.put("&Ecirc", '\u00CA');
1042     ESCAPE_STRINGS.put("&Euml", '\u00CB');
1043     ESCAPE_STRINGS.put("&Igrave", '\u00CC');
1044     ESCAPE_STRINGS.put("&Iacute", '\u00CD');
1045     ESCAPE_STRINGS.put("&Icirc", '\u00CE');
1046     ESCAPE_STRINGS.put("&Iuml", '\u00CF');
1047     ESCAPE_STRINGS.put("&ETH", '\u00D0');
1048     ESCAPE_STRINGS.put("&Ntilde", '\u00D1');
1049     ESCAPE_STRINGS.put("&Ograve", '\u00D2');
1050     ESCAPE_STRINGS.put("&Oacute", '\u00D3');
1051     ESCAPE_STRINGS.put("&Ocirc", '\u00D4');
1052     ESCAPE_STRINGS.put("&Otilde", '\u00D5');
1053     ESCAPE_STRINGS.put("&Ouml", '\u00D6');
1054     ESCAPE_STRINGS.put("&times", '\u00D7');
1055     ESCAPE_STRINGS.put("&Oslash", '\u00D8');
1056     ESCAPE_STRINGS.put("&Ugrave", '\u00D9');
1057     ESCAPE_STRINGS.put("&Uacute", '\u00DA');
1058     ESCAPE_STRINGS.put("&Ucirc", '\u00DB');
1059     ESCAPE_STRINGS.put("&Uuml", '\u00DC');
1060     ESCAPE_STRINGS.put("&Yacute", '\u00DD');
1061     ESCAPE_STRINGS.put("&THORN", '\u00DE');
1062     ESCAPE_STRINGS.put("&szlig", '\u00DF');
1063     ESCAPE_STRINGS.put("&agrave", '\u00E0');
1064     ESCAPE_STRINGS.put("&aacute", '\u00E1');
1065     ESCAPE_STRINGS.put("&acirc", '\u00E2');
1066     ESCAPE_STRINGS.put("&atilde", '\u00E3');
1067     ESCAPE_STRINGS.put("&auml", '\u00E4');
1068     ESCAPE_STRINGS.put("&aring", '\u00E5');
1069     ESCAPE_STRINGS.put("&aelig", '\u00E6');
1070     ESCAPE_STRINGS.put("&ccedil", '\u00E7');
1071     ESCAPE_STRINGS.put("&egrave", '\u00E8');
1072     ESCAPE_STRINGS.put("&eacute", '\u00E9');
1073     ESCAPE_STRINGS.put("&ecirc", '\u00EA');
1074     ESCAPE_STRINGS.put("&euml", '\u00EB');
1075     ESCAPE_STRINGS.put("&igrave", '\u00EC');
1076     ESCAPE_STRINGS.put("&iacute", '\u00ED');
1077     ESCAPE_STRINGS.put("&icirc", '\u00EE');
1078     ESCAPE_STRINGS.put("&iuml", '\u00EF');
1079     ESCAPE_STRINGS.put("&eth", '\u00F0');
1080     ESCAPE_STRINGS.put("&ntilde", '\u00F1');
1081     ESCAPE_STRINGS.put("&ograve", '\u00F2');
1082     ESCAPE_STRINGS.put("&oacute", '\u00F3');
1083     ESCAPE_STRINGS.put("&ocirc", '\u00F4');
1084     ESCAPE_STRINGS.put("&otilde", '\u00F5');
1085     ESCAPE_STRINGS.put("&ouml", '\u00F6');
1086     ESCAPE_STRINGS.put("&divide", '\u00F7');
1087     ESCAPE_STRINGS.put("&oslash", '\u00F8');
1088     ESCAPE_STRINGS.put("&ugrave", '\u00F9');
1089     ESCAPE_STRINGS.put("&uacute", '\u00FA');
1090     ESCAPE_STRINGS.put("&ucirc", '\u00FB');
1091     ESCAPE_STRINGS.put("&uuml", '\u00FC');
1092     ESCAPE_STRINGS.put("&yacute", '\u00FD');
1093     ESCAPE_STRINGS.put("&thorn", '\u00FE');
1094     ESCAPE_STRINGS.put("&yuml", '\u00FF');
    // Entities for Greek letters, arrows, mathematical and other symbols.
1095     ESCAPE_STRINGS.put("&fnof", '\u0192');
1096     ESCAPE_STRINGS.put("&Alpha", '\u0391');
1097     ESCAPE_STRINGS.put("&Beta", '\u0392');
1098     ESCAPE_STRINGS.put("&Gamma", '\u0393');
1099     ESCAPE_STRINGS.put("&Delta", '\u0394');
1100     ESCAPE_STRINGS.put("&Epsilon", '\u0395');
1101     ESCAPE_STRINGS.put("&Zeta", '\u0396');
1102     ESCAPE_STRINGS.put("&Eta", '\u0397');
1103     ESCAPE_STRINGS.put("&Theta", '\u0398');
1104     ESCAPE_STRINGS.put("&Iota", '\u0399');
1105     ESCAPE_STRINGS.put("&Kappa", '\u039A');
1106     ESCAPE_STRINGS.put("&Lambda", '\u039B');
1107     ESCAPE_STRINGS.put("&Mu", '\u039C');
1108     ESCAPE_STRINGS.put("&Nu", '\u039D');
1109     ESCAPE_STRINGS.put("&Xi", '\u039E');
1110     ESCAPE_STRINGS.put("&Omicron", '\u039F');
1111     ESCAPE_STRINGS.put("&Pi", '\u03A0');
1112     ESCAPE_STRINGS.put("&Rho", '\u03A1');
1113     ESCAPE_STRINGS.put("&Sigma", '\u03A3');
1114     ESCAPE_STRINGS.put("&Tau", '\u03A4');
1115     ESCAPE_STRINGS.put("&Upsilon", '\u03A5');
1116     ESCAPE_STRINGS.put("&Phi", '\u03A6');
1117     ESCAPE_STRINGS.put("&Chi", '\u03A7');
1118     ESCAPE_STRINGS.put("&Psi", '\u03A8');
1119     ESCAPE_STRINGS.put("&Omega", '\u03A9');
1120     ESCAPE_STRINGS.put("&alpha", '\u03B1');
1121     ESCAPE_STRINGS.put("&beta", '\u03B2');
1122     ESCAPE_STRINGS.put("&gamma", '\u03B3');
1123     ESCAPE_STRINGS.put("&delta", '\u03B4');
1124     ESCAPE_STRINGS.put("&epsilon", '\u03B5');
1125     ESCAPE_STRINGS.put("&zeta", '\u03B6');
1126     ESCAPE_STRINGS.put("&eta", '\u03B7');
1127     ESCAPE_STRINGS.put("&theta", '\u03B8');
1128     ESCAPE_STRINGS.put("&iota", '\u03B9');
1129     ESCAPE_STRINGS.put("&kappa", '\u03BA');
1130     ESCAPE_STRINGS.put("&lambda", '\u03BB');
1131     ESCAPE_STRINGS.put("&mu", '\u03BC');
1132     ESCAPE_STRINGS.put("&nu", '\u03BD');
1133     ESCAPE_STRINGS.put("&xi", '\u03BE');
1134     ESCAPE_STRINGS.put("&omicron", '\u03BF');
1135     ESCAPE_STRINGS.put("&pi", '\u03C0');
1136     ESCAPE_STRINGS.put("&rho", '\u03C1');
1137     ESCAPE_STRINGS.put("&sigmaf", '\u03C2');
1138     ESCAPE_STRINGS.put("&sigma", '\u03C3');
1139     ESCAPE_STRINGS.put("&tau", '\u03C4');
1140     ESCAPE_STRINGS.put("&upsilon", '\u03C5');
1141     ESCAPE_STRINGS.put("&phi", '\u03C6');
1142     ESCAPE_STRINGS.put("&chi", '\u03C7');
1143     ESCAPE_STRINGS.put("&psi", '\u03C8');
1144     ESCAPE_STRINGS.put("&omega", '\u03C9');
1145     ESCAPE_STRINGS.put("&thetasym", '\u03D1');
1146     ESCAPE_STRINGS.put("&upsih", '\u03D2');
1147     ESCAPE_STRINGS.put("&piv", '\u03D6');
1148     ESCAPE_STRINGS.put("&bull", '\u2022');
1149     ESCAPE_STRINGS.put("&hellip", '\u2026');
1150     ESCAPE_STRINGS.put("&prime", '\u2032');
1151     ESCAPE_STRINGS.put("&Prime", '\u2033');
1152     ESCAPE_STRINGS.put("&oline", '\u203E');
1153     ESCAPE_STRINGS.put("&frasl", '\u2044');
1154     ESCAPE_STRINGS.put("&weierp", '\u2118');
1155     ESCAPE_STRINGS.put("&image", '\u2111');
1156     ESCAPE_STRINGS.put("&real", '\u211C');
1157     ESCAPE_STRINGS.put("&trade", '\u2122');
1158     ESCAPE_STRINGS.put("&alefsym", '\u2135');
1159     ESCAPE_STRINGS.put("&larr", '\u2190');
1160     ESCAPE_STRINGS.put("&uarr", '\u2191');
1161     ESCAPE_STRINGS.put("&rarr", '\u2192');
1162     ESCAPE_STRINGS.put("&darr", '\u2193');
1163     ESCAPE_STRINGS.put("&harr", '\u2194');
1164     ESCAPE_STRINGS.put("&crarr", '\u21B5');
1165     ESCAPE_STRINGS.put("&lArr", '\u21D0');
1166     ESCAPE_STRINGS.put("&uArr", '\u21D1');
1167     ESCAPE_STRINGS.put("&rArr", '\u21D2');
1168     ESCAPE_STRINGS.put("&dArr", '\u21D3');
1169     ESCAPE_STRINGS.put("&hArr", '\u21D4');
1170     ESCAPE_STRINGS.put("&forall", '\u2200');
1171     ESCAPE_STRINGS.put("&part", '\u2202');
1172     ESCAPE_STRINGS.put("&exist", '\u2203');
1173     ESCAPE_STRINGS.put("&empty", '\u2205');
1174     ESCAPE_STRINGS.put("&nabla", '\u2207');
1175     ESCAPE_STRINGS.put("&isin", '\u2208');
1176     ESCAPE_STRINGS.put("&notin", '\u2209');
1177     ESCAPE_STRINGS.put("&ni", '\u220B');
1178     ESCAPE_STRINGS.put("&prod", '\u220F');
1179     ESCAPE_STRINGS.put("&sum", '\u2211');
1180     ESCAPE_STRINGS.put("&minus", '\u2212');
1181     ESCAPE_STRINGS.put("&lowast", '\u2217');
1182     ESCAPE_STRINGS.put("&radic", '\u221A');
1183     ESCAPE_STRINGS.put("&prop", '\u221D');
1184     ESCAPE_STRINGS.put("&infin", '\u221E');
1185     ESCAPE_STRINGS.put("&ang", '\u2220');
1186     ESCAPE_STRINGS.put("&and", '\u2227');
1187     ESCAPE_STRINGS.put("&or", '\u2228');
1188     ESCAPE_STRINGS.put("&cap", '\u2229');
1189     ESCAPE_STRINGS.put("&cup", '\u222A');
1190     ESCAPE_STRINGS.put("&int", '\u222B');
1191     ESCAPE_STRINGS.put("&there4", '\u2234');
1192     ESCAPE_STRINGS.put("&sim", '\u223C');
1193     ESCAPE_STRINGS.put("&cong", '\u2245');
1194     ESCAPE_STRINGS.put("&asymp", '\u2248');
1195     ESCAPE_STRINGS.put("&ne", '\u2260');
1196     ESCAPE_STRINGS.put("&equiv", '\u2261');
1197     ESCAPE_STRINGS.put("&le", '\u2264');
1198     ESCAPE_STRINGS.put("&ge", '\u2265');
1199     ESCAPE_STRINGS.put("&sub", '\u2282');
1200     ESCAPE_STRINGS.put("&sup", '\u2283');
1201     ESCAPE_STRINGS.put("&nsub", '\u2284');
1202     ESCAPE_STRINGS.put("&sube", '\u2286');
1203     ESCAPE_STRINGS.put("&supe", '\u2287');
1204     ESCAPE_STRINGS.put("&oplus", '\u2295');
1205     ESCAPE_STRINGS.put("&otimes", '\u2297');
1206     ESCAPE_STRINGS.put("&perp", '\u22A5');
1207     ESCAPE_STRINGS.put("&sdot", '\u22C5');
1208     ESCAPE_STRINGS.put("&lceil", '\u2308');
1209     ESCAPE_STRINGS.put("&rceil", '\u2309');
1210     ESCAPE_STRINGS.put("&lfloor", '\u230A');
1211     ESCAPE_STRINGS.put("&rfloor", '\u230B');
1212     ESCAPE_STRINGS.put("&lang", '\u2329');
1213     ESCAPE_STRINGS.put("&rang", '\u232A');
1214     ESCAPE_STRINGS.put("&loz", '\u25CA');
1215     ESCAPE_STRINGS.put("&spades", '\u2660');
1216     ESCAPE_STRINGS.put("&clubs", '\u2663');
1217     ESCAPE_STRINGS.put("&hearts", '\u2665');
1218     ESCAPE_STRINGS.put("&diams", '\u2666');
    // Entities for markup-significant characters, extra Latin letters,
    // spacing/directional marks, dashes, quotation marks and the euro sign.
1219     ESCAPE_STRINGS.put("&quot", '\u0022');
1220     ESCAPE_STRINGS.put("&amp", '\u0026');
1221     ESCAPE_STRINGS.put("&lt", '\u003C');
1222     ESCAPE_STRINGS.put("&gt", '\u003E');
1223     ESCAPE_STRINGS.put("&OElig", '\u0152');
1224     ESCAPE_STRINGS.put("&oelig", '\u0153');
1225     ESCAPE_STRINGS.put("&Scaron", '\u0160');
1226     ESCAPE_STRINGS.put("&scaron", '\u0161');
1227     ESCAPE_STRINGS.put("&Yuml", '\u0178');
1228     ESCAPE_STRINGS.put("&circ", '\u02C6');
1229     ESCAPE_STRINGS.put("&tilde", '\u02DC');
1230     ESCAPE_STRINGS.put("&ensp", '\u2002');
1231     ESCAPE_STRINGS.put("&emsp", '\u2003');
1232     ESCAPE_STRINGS.put("&thinsp", '\u2009');
1233     ESCAPE_STRINGS.put("&zwnj", '\u200C');
1234     ESCAPE_STRINGS.put("&zwj", '\u200D');
1235     ESCAPE_STRINGS.put("&lrm", '\u200E');
1236     ESCAPE_STRINGS.put("&rlm", '\u200F');
1237     ESCAPE_STRINGS.put("&ndash", '\u2013');
1238     ESCAPE_STRINGS.put("&mdash", '\u2014');
1239     ESCAPE_STRINGS.put("&lsquo", '\u2018');
1240     ESCAPE_STRINGS.put("&rsquo", '\u2019');
1241     ESCAPE_STRINGS.put("&sbquo", '\u201A');
1242     ESCAPE_STRINGS.put("&ldquo", '\u201C');
1243     ESCAPE_STRINGS.put("&rdquo", '\u201D');
1244     ESCAPE_STRINGS.put("&bdquo", '\u201E');
1245     ESCAPE_STRINGS.put("&dagger", '\u2020');
1246     ESCAPE_STRINGS.put("&Dagger", '\u2021');
1247     ESCAPE_STRINGS.put("&permil", '\u2030');
1248     ESCAPE_STRINGS.put("&lsaquo", '\u2039');
1249     ESCAPE_STRINGS.put("&rsaquo", '\u203A');
1250     ESCAPE_STRINGS.put("&euro", '\u20AC');
1251 
    // Hex letters in both cases; unescapeHTML tests decimal digits separately
    // with Character.isDigit when scanning a hexadecimal numeric entity.
1252     HEX_LETTERS = new HashSet<Character>(12);
1253 
1254     HEX_LETTERS.add('a');
1255     HEX_LETTERS.add('A');
1256     HEX_LETTERS.add('b');
1257     HEX_LETTERS.add('B');
1258     HEX_LETTERS.add('c');
1259     HEX_LETTERS.add('C');
1260     HEX_LETTERS.add('d');
1261     HEX_LETTERS.add('D');
1262     HEX_LETTERS.add('e');
1263     HEX_LETTERS.add('E');
1264     HEX_LETTERS.add('f');
1265     HEX_LETTERS.add('F');
1266   }
1267 
1268   /**
1269    * <p>
1270    * Replace all the occurences of HTML escape strings with the
1271    * respective characters.
1272    * </p>
1273    * <p>
1274    * The default mode is strict (requiring semicolons).
1275    * </p>
1276    *
1277    * @param s a <code>String</code> value
1278    * @return a <code>String</code> value
1279    * @throws NullPointerException if the input string is null.
1280    */
unescapeHTML(String s)1281   public static final String unescapeHTML(String s) {
1282     return unescapeHTML(s, false);
1283   }
1284 
1285   /**
1286    * Replace all the occurences of HTML escape strings with the
1287    * respective characters.
1288    *
1289    * @param s a <code>String</code> value
1290    * @param emulateBrowsers a <code>Boolean</code> value that tells the method
1291    *     to allow entity refs not terminated with a semicolon to be unescaped.
1292    *     (a quirk of this feature, and some browsers, is that an explicit
1293    *     terminating character is needed - e.g., &lt$ would be unescaped, but
1294    *     not &ltab - see the tests for a more in-depth description of browsers)
1295    * @return a <code>String</code> value
1296    * @throws NullPointerException if the input string is null.
1297    */
unescapeHTML(String s, boolean emulateBrowsers)1298   public static final String unescapeHTML(String s, boolean emulateBrowsers) {
1299 
1300     // See if there are any '&' in the string since that is what we look
1301     // for to escape. If there isn't, then we don't need to escape this string
1302     // Based on similar technique used in the escape function.
1303     int index = s.indexOf('&');
1304     if (index == -1) {
1305       // Nothing to escape. Return the original string.
1306       return s;
1307     }
1308 
1309     // We found an escaped character. Start slow escaping from there.
1310     char[] chars = s.toCharArray();
1311     char[] escaped = new char[chars.length];
1312     System.arraycopy(chars, 0, escaped, 0, index);
1313 
1314     // Note: escaped[pos] = end of the escaped char array.
1315     int pos = index;
1316 
1317     for (int i = index; i < chars.length;) {
1318       if (chars[i] != '&') {
1319         escaped[pos++] = chars[i++];
1320         continue;
1321       }
1322 
1323       // Allow e.g. &#123;
1324       int j = i + 1;
1325       boolean isNumericEntity = false;
1326       if (j < chars.length && chars[j] == '#') {
1327         j++;
1328         isNumericEntity = true;
1329       }
1330 
1331       // if it's numeric, also check for hex
1332       boolean isHexEntity = false;
1333       if (j < chars.length && (chars[j] == 'x' || chars[j] == 'X')) {
1334         j++;
1335         isHexEntity = true;
1336       }
1337 
1338       // Scan until we find a char that is not valid for this sequence.
1339       for (; j < chars.length; j++) {
1340         char ch = chars[j];
1341         boolean isDigit = Character.isDigit(ch);
1342         if (isNumericEntity) {
1343           // non-hex numeric sequence end condition
1344           if (!isHexEntity && !isDigit) {
1345             break;
1346           }
1347           // hex sequence end contition
1348           if (isHexEntity && !isDigit && !HEX_LETTERS.contains(ch)) {
1349             break;
1350           }
1351         }
1352         // anything other than a digit or letter is always an end condition
1353         if (!isDigit && !Character.isLetter(ch)) {
1354           break;
1355         }
1356       }
1357 
1358       boolean replaced = false;
1359       if ((j <= chars.length && emulateBrowsers) ||
1360           (j < chars.length && chars[j] == ';')) {
1361         // Check for &#D; and &#xD; pattern
1362         if (i + 2 < chars.length && s.charAt(i + 1) == '#') {
1363           try {
1364             long charcode = 0;
1365             char ch = s.charAt(i + 2);
1366             if (isHexEntity) {
1367               charcode = Long.parseLong(
1368                   new String(chars, i + 3, j - i - 3), 16);
1369             } else if (Character.isDigit(ch)) {
1370               charcode = Long.parseLong(
1371                   new String(chars, i + 2, j - i - 2));
1372             }
1373             // D800 to DFFF are for UTF16 surrogate pairs, and are not valid HTML entities
1374             // Code points 0xFFFE and 0xFFFF are unicode noncharacters
1375             if ((charcode > 0 && charcode < 0xD800) || (charcode > 0xDFFF && charcode < 0xFFFE)) {
1376               escaped[pos++] = (char) charcode;
1377               replaced = true;
1378             } else if (charcode >= 0x10000 && charcode < 0x110000) {
1379               // These characters are represented as surrogate pairs in UTF16
1380               escaped[pos++] = (char) ((charcode - 0x10000) / 0x400 + 0xD800);
1381               escaped[pos++] = (char) ((charcode - 0x10000) % 0x400 + 0xDC00);
1382               replaced = true;
1383             }
1384           } catch (NumberFormatException ex) {
1385             // Failed, not replaced.
1386           }
1387         } else {
1388           String key = new String(chars, i, j - i);
1389           Character repl = ESCAPE_STRINGS.get(key);
1390           if (repl != null) {
1391             escaped[pos++] = repl;
1392             replaced = true;
1393           }
1394         }
1395         // Skip over ';'
1396         if (j < chars.length && chars[j] == ';') {
1397           j++;
1398         }
1399       }
1400 
1401       if (!replaced) {
1402         // Not a recognized escape sequence, leave as-is
1403         System.arraycopy(chars, i, escaped, pos, j - i);
1404         pos += j - i;
1405       }
1406       i = j;
1407     }
1408     return new String(escaped, 0, pos);
1409   }
1410 
1411   // Escaper for < and > only.
1412   private static final CharEscaper LT_GT_ESCAPE =
1413       new CharEscaperBuilder()
1414         .addEscape('<', "&lt;")
1415         .addEscape('>', "&gt;")
1416         .toEscaper();
1417 
1418   private static final Pattern htmlTagPattern =
1419       Pattern.compile("</?[a-zA-Z][^>]*>");
1420 
1421   /**
1422    * Given a <code>String</code>, returns an equivalent <code>String</code> with
1423    * all HTML tags stripped. Note that HTML entities, such as "&amp;amp;" will
1424    * still be preserved.
1425    */
stripHtmlTags(String string)1426   public static String stripHtmlTags(String string) {
1427     if ((string == null) || "".equals(string)) {
1428       return string;
1429     }
1430     String stripped = htmlTagPattern.matcher(string).replaceAll("");
1431     /*
1432      * Certain inputs result in a well-formed HTML:
1433      * <<X>script>alert(0)<</X>/script> results in <script>alert(0)</script>
1434      * The following step ensures that no HTML can slip through by replacing all
1435      * < and > characters with &lt; and &gt; after HTML tags were stripped.
1436      */
1437     return LT_GT_ESCAPE.escape(stripped);
1438   }
1439 
1440   /**
1441    * We escape some characters in s to be able to insert strings into JavaScript
1442    * code. Also, make sure that we don't write out {@code -->} or
1443    * {@code </script>}, which may close a script tag, or any char in ["'>] which
1444    * might close a tag or attribute if seen inside an attribute.
1445    */
javaScriptEscape(CharSequence s)1446   public static String javaScriptEscape(CharSequence s) {
1447     return javaScriptEscapeHelper(s, false);
1448   }
1449 
1450   /**
1451    * We escape some characters in s to be able to insert strings into JavaScript
1452    * code. Also, make sure that we don't write out {@code -->} or
1453    * {@code </script>}, which may close a script tag, or any char in ["'>] which
1454    * might close a tag or attribute if seen inside an attribute.
1455    * Turns all non-ascii characters into ASCII javascript escape sequences
1456    * (eg \\uhhhh or \ooo).
1457    */
javaScriptEscapeToAscii(CharSequence s)1458   public static String javaScriptEscapeToAscii(CharSequence s) {
1459     return javaScriptEscapeHelper(s, true);
1460   }
1461 
1462   /**
1463    * Represents the type of javascript escaping to perform.  Each enum below
1464    * determines whether to use octal escapes and how to handle quotes.
1465    */
1466   public static enum JsEscapingMode {
1467     /** No octal escapes, pass-through ', and escape " as \". */
1468     JSON,
1469 
1470     /** Octal escapes, escapes ' and " to \42 and \47, respectively. */
1471     EMBEDDABLE_JS,
1472 
1473     /** Octal escapes, escapes ' and " to \' and \". */
1474     MINIMAL_JS
1475   }
1476 
1477   /**
1478    * Helper for javaScriptEscape and javaScriptEscapeToAscii
1479    */
javaScriptEscapeHelper(CharSequence s, boolean escapeToAscii)1480   private static String javaScriptEscapeHelper(CharSequence s,
1481                                                boolean escapeToAscii) {
1482     StringBuilder sb = new StringBuilder(s.length() * 9 / 8);
1483     try {
1484       escapeStringBody(s, escapeToAscii, JsEscapingMode.EMBEDDABLE_JS, sb);
1485     } catch (IOException ex) {
1486       // StringBuilder.append does not throw IOExceptions.
1487       throw new RuntimeException(ex);
1488     }
1489     return sb.toString();
1490   }
1491 
1492   /**
1493    * Appends the javascript string literal equivalent of plainText to the given
1494    * out buffer.
1495    * @param plainText the string to escape.
1496    * @param escapeToAscii true to encode all characters not in ascii [\x20-\x7e]
1497    *   <br>
1498    *   Full escaping of unicode entites isn't required but this makes
1499    *   sure that unicode strings will survive regardless of the
1500    *   content-encoding of the javascript file which is important when
1501    *   we use this function to autogenerated javascript source files.
1502    *   This is disabled by default because it makes non-latin strings very long.
1503    *   <br>
1504    *   If you seem to have trouble with character-encodings, maybe
1505    *   turn this on to see if the problem goes away.  If so, you need
1506    *   to specify a character encoding for your javascript somewhere.
1507    * @param jsEscapingMode determines the type of escaping to perform.
1508    * @param out the buffer to append output to.
1509    */
1510   /*
1511    * To avoid fallthrough, we would have to either use a hybrid switch-case/if
1512    * approach (which would obscure our special handling for ' and "), duplicate
1513    * the content of the default case, or pass a half-dozen parameters to a
1514    * helper method containing the code from the default case.
1515    */
1516   @SuppressWarnings("fallthrough")
escapeStringBody( CharSequence plainText, boolean escapeToAscii, JsEscapingMode jsEscapingMode, Appendable out)1517   public static void escapeStringBody(
1518       CharSequence plainText, boolean escapeToAscii,
1519       JsEscapingMode jsEscapingMode, Appendable out)
1520       throws IOException {
1521     int pos = 0;  // Index just past the last char in plainText written to out.
1522     int len = plainText.length();
1523     for (int codePoint, charCount, i = 0; i < len; i += charCount) {
1524       codePoint = Character.codePointAt(plainText, i);
1525       charCount = Character.charCount(codePoint);
1526 
1527       if (!shouldEscapeChar(codePoint, escapeToAscii, jsEscapingMode)) {
1528         continue;
1529       }
1530 
1531       out.append(plainText, pos, i);
1532       pos = i + charCount;
1533       switch (codePoint) {
1534         case '\b': out.append("\\b"); break;
1535         case '\t': out.append("\\t"); break;
1536         case '\n': out.append("\\n"); break;
1537         case '\f': out.append("\\f"); break;
1538         case '\r': out.append("\\r"); break;
1539         case '\\': out.append("\\\\"); break;
1540         case '"': case '\'':
1541           if (jsEscapingMode == JsEscapingMode.JSON && '\'' == codePoint) {
1542             // JSON does not escape a single quote (and it should be surrounded
1543             // by double quotes).
1544             out.append((char) codePoint);
1545             break;
1546           } else if (jsEscapingMode != JsEscapingMode.EMBEDDABLE_JS) {
1547             out.append('\\').append((char) codePoint);
1548             break;
1549           }
1550           // fall through
1551         default:
1552           if (codePoint >= 0x100 || jsEscapingMode == JsEscapingMode.JSON) {
1553             appendHexJavaScriptRepresentation(codePoint, out);
1554           } else {
1555             // Output the minimal octal encoding.  We can't use an encoding
1556             // shorter than three digits if the next digit is a valid octal
1557             // digit.
1558             boolean pad = i + charCount >= len
1559                 || isOctal(plainText.charAt(i + charCount));
1560             appendOctalJavaScriptRepresentation((char) codePoint, pad, out);
1561           }
1562           break;
1563       }
1564     }
1565     out.append(plainText, pos, len);
1566   }
1567 
1568   /**
1569    * Helper for escapeStringBody, which decides whether to escape a character.
1570    */
shouldEscapeChar(int codePoint, boolean escapeToAscii, JsEscapingMode jsEscapingMode)1571   private static boolean shouldEscapeChar(int codePoint,
1572       boolean escapeToAscii, JsEscapingMode jsEscapingMode) {
1573     // If non-ASCII chars should be escaped, identify non-ASCII code points.
1574     if (escapeToAscii && (codePoint < 0x20 || codePoint > 0x7e)) {
1575       return true;
1576     }
1577 
1578     // If in JSON escaping mode, check JSON *and* JS escaping rules. The JS
1579     // escaping rules will escape more characters than needed for JSON,
1580     // but it is safe to escape any character in JSON.
1581     // TODO(bbavar): Remove unnecessary escaping for JSON, as long as it can be
1582     //               shown that this change in legacy behavior is safe.
1583     if (jsEscapingMode == JsEscapingMode.JSON) {
1584       return mustEscapeCharInJsonString(codePoint)
1585           || mustEscapeCharInJsString(codePoint);
1586     }
1587 
1588     // Finally, just check the default JS escaping rules.
1589     return mustEscapeCharInJsString(codePoint);
1590   }
1591 
1592   /**
1593    * Returns a javascript representation of the character in a hex escaped
1594    * format.
1595    *
1596    * @param codePoint The codepoint to append.
1597    * @param out The buffer to which the hex representation should be appended.
1598    */
appendHexJavaScriptRepresentation( int codePoint, Appendable out)1599   private static void appendHexJavaScriptRepresentation(
1600       int codePoint, Appendable out)
1601       throws IOException {
1602     if (Character.isSupplementaryCodePoint(codePoint)) {
1603       // Handle supplementary unicode values which are not representable in
1604       // javascript.  We deal with these by escaping them as two 4B sequences
1605       // so that they will round-trip properly when sent from java to javascript
1606       // and back.
1607       char[] surrogates = Character.toChars(codePoint);
1608       appendHexJavaScriptRepresentation(surrogates[0], out);
1609       appendHexJavaScriptRepresentation(surrogates[1], out);
1610       return;
1611     }
1612     out.append("\\u")
1613         .append(HEX_CHARS[(codePoint >>> 12) & 0xf])
1614         .append(HEX_CHARS[(codePoint >>> 8) & 0xf])
1615         .append(HEX_CHARS[(codePoint >>> 4) & 0xf])
1616         .append(HEX_CHARS[codePoint & 0xf]);
1617   }
1618 
1619   /**
1620    * Returns a javascript representation of the character in a hex escaped
1621    * format. Although this is a rather specific method, it is made public
1622    * because it is also used by the JSCompiler.
1623    *
1624    * @param ch The character to append.
1625    * @param pad true to force use of the full 3 digit representation.
1626    * @param out The buffer to which the hex representation should be appended.
1627    */
appendOctalJavaScriptRepresentation( char ch, boolean pad, Appendable out)1628   private static void appendOctalJavaScriptRepresentation(
1629       char ch, boolean pad, Appendable out) throws IOException {
1630     if (ch >= 0100
1631         // Be paranoid at the end of a string since someone might call
1632         // this method again with another string segment.
1633         || pad) {
1634       out.append('\\')
1635           .append(OCTAL_CHARS[(ch >>> 6) & 0x7])
1636           .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
1637           .append(OCTAL_CHARS[ch & 0x7]);
1638     } else if (ch >= 010) {
1639       out.append('\\')
1640           .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
1641           .append(OCTAL_CHARS[ch & 0x7]);
1642     } else {
1643       out.append('\\')
1644           .append(OCTAL_CHARS[ch & 0x7]);
1645     }
1646   }
1647 
1648   /**
1649    * Although this is a rather specific method, it is made public
1650    * because it is also used by the JSCompiler.
1651    *
1652    * @see #appendHexJavaScriptRepresentation(int, Appendable)
1653    */
appendHexJavaScriptRepresentation(StringBuilder sb, char c)1654   public static void appendHexJavaScriptRepresentation(StringBuilder sb,
1655                                                        char c) {
1656     try {
1657       appendHexJavaScriptRepresentation(c, sb);
1658     } catch (IOException ex) {
1659       // StringBuilder does not throw IOException.
1660       throw new RuntimeException(ex);
1661     }
1662   }
1663 
1664   /**
1665    * Undo escaping as performed in javaScriptEscape(.)
1666    * Throws an IllegalArgumentException if the string contains
1667    * bad escaping.
1668    */
javaScriptUnescape(String s)1669   public static String javaScriptUnescape(String s) {
1670     StringBuilder sb = new StringBuilder(s.length());
1671     for (int i = 0; i < s.length(); ) {
1672       char c = s.charAt(i);
1673       if (c == '\\') {
1674         i = javaScriptUnescapeHelper(s, i + 1, sb);
1675       } else {
1676         sb.append(c);
1677         i++;
1678       }
1679     }
1680     return sb.toString();
1681   }
1682 
1683   /**
1684    * Looks for an escape code starting at index i of s,
1685    * and appends it to sb.
1686    * @return the index of the first character in s
1687    * after the escape code.
1688    * @throws IllegalArgumentException if the escape code
1689    * is invalid
1690    */
javaScriptUnescapeHelper(String s, int i, StringBuilder sb)1691   private static int javaScriptUnescapeHelper(String s, int i,
1692                                               StringBuilder sb) {
1693     if (i >= s.length()) {
1694       throw new IllegalArgumentException(
1695           "End-of-string after escape character in [" + s + "]");
1696     }
1697 
1698     char c = s.charAt(i++);
1699     switch (c) {
1700       case 'n': sb.append('\n'); break;
1701       case 'r': sb.append('\r'); break;
1702       case 't': sb.append('\t'); break;
1703       case 'b': sb.append('\b'); break;
1704       case 'f': sb.append('\f'); break;
1705       case '\\':
1706       case '\"':
1707       case '\'':
1708       case '>':
1709         sb.append(c);
1710         break;
1711       case '0': case '1': case '2': case '3':
1712       case '4': case '5': case '6': case '7':
1713         --i;  // backup to first octal digit
1714         int nOctalDigits = 1;
1715         int digitLimit = c < '4' ? 3 : 2;
1716         while (nOctalDigits < digitLimit && i + nOctalDigits < s.length()
1717                && isOctal(s.charAt(i + nOctalDigits))) {
1718           ++nOctalDigits;
1719         }
1720         sb.append(
1721             (char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8));
1722         i += nOctalDigits;
1723         break;
1724       case 'x':
1725       case 'u':
1726         String hexCode;
1727         int nHexDigits = (c == 'u' ? 4 : 2);
1728         try {
1729           hexCode = s.substring(i, i + nHexDigits);
1730         } catch (IndexOutOfBoundsException ioobe) {
1731           throw new IllegalArgumentException(
1732               "Invalid unicode sequence [" + s.substring(i) + "] at index " + i
1733               + " in [" + s + "]");
1734         }
1735         int unicodeValue;
1736         try {
1737           unicodeValue = Integer.parseInt(hexCode, 16);
1738         } catch (NumberFormatException nfe) {
1739           throw new IllegalArgumentException(
1740               "Invalid unicode sequence [" + hexCode + "] at index " + i +
1741               " in [" + s + "]");
1742         }
1743         sb.append((char) unicodeValue);
1744         i += nHexDigits;
1745         break;
1746       default:
1747         throw new IllegalArgumentException(
1748             "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]"
1749             );
1750     }
1751 
1752     return i;
1753   }
1754 
1755   // C0 control characters except \t, \n, and \r and 0xFFFE and 0xFFFF
1756   private static final CharMatcher CONTROL_MATCHER = CharMatcher.anyOf(
1757       "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
1758       "\u0008\u000B\u000C\u000E\u000F" +
1759       "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
1760       "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
1761       "\uFFFE\uFFFF");
1762 
1763   /**
1764    * Escape a string that is meant to be embedded in a CDATA section.
1765    * The returned string is guaranteed to be valid CDATA content.
1766    * The syntax of CDATA sections is the following:
1767    * <blockquote>
1768    *   <code>&lt;[!CDATA[...]]&gt;</code>
1769    * </blockquote>
1770    * The only invalid character sequence in a CDATA tag is "]]&gt;".
1771    * If this sequence is present in the input string, we replace
1772    * it by closing the current CDATA field, then write ']]&amp;gt;',
1773    * then reopen a new CDATA section.
1774    */
1775   public static String xmlCDataEscape(String s) {
1776      // Make sure there are no illegal control characters.
1777      s = CONTROL_MATCHER.removeFrom(s);
1778     // Return the original reference if the string doesn't have a match.
1779     int found = s.indexOf("]]>");
1780     if (found == -1) {
1781       return s;
1782     }
1783 
1784     // For each occurrence of "]]>", append a string that adds "]]&gt;" after
1785     // the end of the CDATA which has just been closed, then opens a new CDATA.
1786     StringBuilder sb = new StringBuilder();
1787     int prev = 0;
1788     do {
1789       sb.append(s.substring(prev, found + 3));
1790       sb.append("]]&gt;<![CDATA[");
1791       prev = found + 3;
1792     } while ((found = s.indexOf("]]>", prev)) != -1);
1793     sb.append(s.substring(prev));
1794     return sb.toString();
1795   }
1796 
1797   /**
1798    * We escape some characters in s to be able to insert strings into Java code
1799    *
1800    * @deprecated Use {@link CharEscapers#asciiHtmlEscaper()} and {@link
1801    * CharEscapers#javaCharEscaper()} or {@link CharEscapers#javaStringEscaper()}
1802    * instead. This method combines two forms of escaping in a way that's rarely
1803    * desired.
1804    */
1805   @Deprecated
1806   public static String javaEscape(String s) {
1807     return JAVA_ESCAPE.escape(s);
1808   }
1809 
1810   // Java escaper.
1811   private static final CharEscaper JAVA_ESCAPE =
1812       new CharEscaperBuilder()
1813         .addEscape('\n', "\\n")
1814         .addEscape('\r', "\\r")
1815         .addEscape('\t', "\\t")
1816         .addEscape('\\', "\\\\")
1817         .addEscape('\"', "\\\"")
1818         .addEscape('&', "&amp;")
1819         .addEscape('<', "&lt;")
1820         .addEscape('>', "&gt;")
1821         .addEscape('\'', "\\\'")
1822         .toEscaper();
1823 
/**
 * Backslash-escapes the special characters of a string so it can be used as
 * part of a regex pattern. This method is for use on gnu.regexp style
 * regular expressions.
 *
 * @param s the string to escape
 * @return the escaped string
 * @deprecated Use {@link Pattern#quote(String)} instead. Note that it may not
 * be compatible with gnu.regexp style regular expressions.
 */
@Deprecated
public static String regexEscape(String s) {
  final String escaped = REGEX_ESCAPE.escape(s);
  return escaped;
}

// Escaper behind regexEscape: puts a backslash in front of each of the
// characters ( ) | * + ? . { } [ ] $ ^ and backslash.
private static final CharEscaper REGEX_ESCAPE;
static {
  CharEscaperBuilder builder = new CharEscaperBuilder();
  for (char meta : "()|*+?.{}[]$^\\".toCharArray()) {
    builder = builder.addEscape(meta, "\\" + meta);
  }
  REGEX_ESCAPE = builder.toEscaper();
}
1855 
1856   /**
1857    *  If you want to preserve the exact
1858    * current (odd) behavior when {@code doStrip} is {@code true}, use
1859    * {@code .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on
1860    * the splitter.
1861    *
1862    * @param in what to process
1863    * @param delimiter the delimiting string
1864    * @return the tokens
1865    * @deprecated see the detailed instructions under
1866    *     {@link #split(String, String, boolean)}
1867    */
1868   @Deprecated
1869   public static LinkedList<String> string2List(
1870       String in, String delimiter, boolean doStrip) {
1871     if (in == null) {
1872       return null;
1873     }
1874 
1875     LinkedList<String> out = new LinkedList<String>();
1876     string2Collection(in, delimiter, doStrip, out);
1877     return out;
1878   }
1879 
1880   /**
1881    * See the detailed instructions under {@link
1882    * #split(String, String, boolean)}. Pass the resulting {@code Iterable} to
1883    * {@link com.google.common.collect.Sets#newHashSet(Iterable)}. If you want to
1884    * preserve the exact current (odd) behavior when {@code doStrip} is {@code
1885    * true}, use {@code
1886    * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
1887    * splitter.
1888    *
1889    * @param in what to process
1890    * @param delimiter the delimiting string
1891    * @param doStrip to strip the substrings before adding to the list
1892    * @return the tokens
1893    * @deprecated see the detailed instructions under
1894    *     {@link #split(String, String, boolean)}
1895    */
1896   @Deprecated
1897   public static Set<String> string2Set(
1898        String in, String delimiter, boolean doStrip) {
1899     if (in == null) {
1900       return null;
1901     }
1902 
1903     HashSet<String> out = new HashSet<String>();
1904     string2Collection(in, delimiter, doStrip, out);
1905     return out;
1906   }
1907 
1908   /**
1909    * See the detailed instructions under {@link
1910    * #split(String, String, boolean)}. If you want to preserve the exact current
1911    * (odd) behavior when {@code doStrip} is {@code true}, use {@code
1912    * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
1913    * splitter.
1914    *
1915    * @param in The delimited input string to process
1916    * @param delimiter The string delimiting entries in the input string.
1917    * @param doStrip whether to strip the substrings before adding to the
1918    *          collection
1919    * @param collection The collection to which the strings will be added. If
1920    *          <code>null</code>, a new <code>List</code> will be created.
1921    * @return The collection to which the substrings were added. This is
1922    *         syntactic sugar to allow call chaining.
1923    * @deprecated see the detailed instructions under
1924    *     {@link #split(String, String, boolean)}
1925    */
1926   @Deprecated
1927   public static Collection<String> string2Collection(
1928       String in,
1929       String delimiter,
1930       boolean doStrip,
1931       Collection<String> collection) {
1932     if (in == null) {
1933       return null;
1934     }
1935     if (collection == null) {
1936       collection = new ArrayList<String>();
1937     }
1938     if (delimiter == null || delimiter.length() == 0) {
1939       collection.add(in);
1940       return collection;
1941     }
1942 
1943     int fromIndex = 0;
1944     int pos;
1945     while ((pos = in.indexOf(delimiter, fromIndex)) >= 0) {
1946       String interim = in.substring(fromIndex, pos);
1947       if (doStrip) {
1948         interim = strip(interim);
1949       }
1950       if (!doStrip || interim.length() > 0) {
1951         collection.add(interim);
1952       }
1953 
1954       fromIndex = pos + delimiter.length();
1955     }
1956 
1957     String interim = in.substring(fromIndex);
1958     if (doStrip) {
1959       interim = strip(interim);
1960     }
1961     if (!doStrip || interim.length() > 0) {
1962       collection.add(interim);
1963     }
1964 
1965     return collection;
1966   }
1967 
1968   /**
1969    * This converts a string to a Map. It will first split the string into
1970    * entries using delimEntry. Then each entry is split into a key and a value
1971    * using delimKey. By default we strip the keys. Use doStripEntry to strip
1972    * also the entries.
1973    *
1974    * Note that this method returns a {@link HashMap}, which means that entries
1975    * will be in no particular order. See {@link #stringToOrderedMap}.
1976    *
1977    * @param in the string to be processed
1978    * @param delimEntry delimiter for the entries
1979    * @param delimKey delimiter between keys and values
1980    * @param doStripEntry strip entries before inserting in the map
1981    *
1982    * @return HashMap
1983    */
string2Map( String in, String delimEntry, String delimKey, boolean doStripEntry)1984   public static HashMap<String, String> string2Map(
1985       String in, String delimEntry, String delimKey,
1986       boolean doStripEntry) {
1987     if (in == null) {
1988       return null;
1989     }
1990 
1991     return stringToMapImpl(new HashMap<String, String>(), in, delimEntry,
1992         delimKey, doStripEntry);
1993   }
1994 
1995   /**
1996    * This converts a string to a Map, with entries in the same order as the
1997    * key/value pairs in the input string. It will first split the string into
1998    * entries using delimEntry. Then each entry is split into a key and a value
1999    * using delimKey. By default we strip the keys. Use doStripEntry to strip
2000    * also the entries.
2001    *
2002    * @param in the string to be processed
2003    * @param delimEntry delimiter for the entries
2004    * @param delimKey delimiter between keys and values
2005    * @param doStripEntry strip entries before inserting in the map
2006    *
2007    * @return key/value pairs as a Map, in order
2008    */
stringToOrderedMap( String in, String delimEntry, String delimKey, boolean doStripEntry)2009   public static Map<String, String> stringToOrderedMap(
2010       String in, String delimEntry, String delimKey,
2011       boolean doStripEntry) {
2012     if (in == null) {
2013       return null;
2014     }
2015 
2016     return stringToMapImpl(new LinkedHashMap<String, String>(), in, delimEntry,
2017         delimKey, doStripEntry);
2018   }
2019 
2020   /**
2021    * This adds key/value pairs from the given string to the given Map.
2022    * It will first split the string into entries using delimEntry. Then each
2023    * entry is split into a key and a value using delimKey. By default we
2024    * strip the keys. Use doStripEntry to strip also the entries.
2025    *
2026    * @param out - Map to output into
2027    * @param in - the string to be processed
2028    * @param delimEntry - delimiter for the entries
2029    * @param delimKey - delimiter between keys and values
2030    * @param doStripEntry - strip entries before inserting in the map
2031    * @return out, for caller's convenience
2032    */
stringToMapImpl(T out, String in, String delimEntry, String delimKey, boolean doStripEntry)2033   private static <T extends Map<String, String>> T stringToMapImpl(T out,
2034       String in, String delimEntry, String delimKey, boolean doStripEntry) {
2035 
2036     if (isEmpty(delimEntry) || isEmpty(delimKey)) {
2037       out.put(strip(in), "");
2038       return out;
2039     }
2040 
2041     Iterator<String> it = string2List(in, delimEntry, false).iterator();
2042     int len = delimKey.length();
2043     while (it.hasNext()) {
2044       String entry = it.next();
2045       int pos = entry.indexOf(delimKey);
2046       if (pos > 0) {
2047         String value = entry.substring(pos + len);
2048         if (doStripEntry) {
2049           value = strip(value);
2050         }
2051         out.put(strip(entry.substring(0, pos)), value);
2052       } else {
2053         out.put(strip(entry), "");
2054       }
2055     }
2056 
2057     return out;
2058   }
2059 
2060   /**
2061    * This function concatenates the elements of a Map in a string with form
2062    *  "<key1><sepKey><value1><sepEntry>...<keyN><sepKey><valueN>"
2063    *
2064    * @param in - the map to be converted
2065    * @param sepKey - the separator to put between key and value
2066    * @param sepEntry - the separator to put between map entries
2067    * @return String
2068    * @deprecated create a {@link MapJoiner}, for example {@code
2069    *     Joiner.on(sepEntry).withKeyValueSeparator(sepKey)}. Ensure that your
2070    *     map is non-null and use this map joiner's {@link MapJoiner#join(Map)}
2071    *     method. To preserve behavior exactly, just in-line this method call.
2072    */
map2String( Map<K, V> in, String sepKey, String sepEntry)2073   @Deprecated public static <K, V> String map2String(
2074       Map<K, V> in, String sepKey, String sepEntry) {
2075     return (in == null) ? null : Joiner
2076         .on(sepEntry)
2077         .useForNull("null")
2078         .withKeyValueSeparator(sepKey)
2079         .join(in);
2080   }
2081 
2082   /**
2083    * Given a map, creates and returns a new map in which all keys are the
2084    * lower-cased version of each key.
2085    *
2086    * @param map A map containing String keys to be lowercased
2087    * @throws IllegalArgumentException if the map contains duplicate string keys
2088    *           after lower casing
2089    */
lowercaseKeys(Map<String, V> map)2090   public static <V> Map<String, V> lowercaseKeys(Map<String, V> map) {
2091     Map<String, V> result = new HashMap<String, V>(map.size());
2092     for (Map.Entry<String, V> entry : map.entrySet()) {
2093       String key = entry.getKey();
2094       if (result.containsKey(key.toLowerCase())) {
2095         throw new IllegalArgumentException(
2096             "Duplicate string key in map when lower casing");
2097       }
2098       result.put(key.toLowerCase(), entry.getValue());
2099     }
2100     return result;
2101   }
2102 
2103   /**
2104    * Replaces any string of adjacent whitespace characters with the whitespace
2105    * character " ".
2106    *
2107    * @param str the string you want to munge
2108    * @return String with no more excessive whitespace!
2109    * @deprecated ensure the string is not null and use {@code
2110    *     CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ')}; also consider
2111    *     whether you really want the legacy whitespace definition, or something
2112    *     more standard like {@link CharMatcher#WHITESPACE}.
2113    */
collapseWhitespace(String str)2114   @Deprecated public static String collapseWhitespace(String str) {
2115     return (str == null) ? null
2116         : CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ');
2117   }
2118 
2119   /**
2120    * Replaces any string of matched characters with the supplied string.<p>
2121    *
2122    * This is a more general version of collapseWhitespace.
2123    *
2124    * <pre>
2125    *   E.g. collapse("hello     world", " ", "::")
2126    *   will return the following string: "hello::world"
2127    * </pre>
2128    *
2129    * @param str the string you want to munge
2130    * @param chars all of the characters to be considered for munge
2131    * @param replacement the replacement string
2132    * @return munged and replaced string.
2133    * @deprecated if {@code replacement} is the empty string, use {@link
2134    *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
2135    *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
2136    *     replacement strings use {@link String#replaceAll(String, String)} with
2137    *     a regular expression that matches one or more occurrences of {@code
2138    *     chars}. In all cases you must first ensure that {@code str} is not
2139    *     null.
2140    */
collapse( String str, String chars, String replacement)2141   @Deprecated public static String collapse(
2142       String str, String chars, String replacement) {
2143     if (str == null) {
2144       return null;
2145     }
2146 
2147     StringBuilder newStr = new StringBuilder();
2148 
2149     boolean prevCharMatched = false;
2150     char c;
2151     for (int i = 0; i < str.length(); i++) {
2152       c = str.charAt(i);
2153       if (chars.indexOf(c) != -1) {
2154         // this character is matched
2155         if (prevCharMatched) {
2156           // apparently a string of matched chars, so don't append anything
2157           // to the string
2158           continue;
2159         }
2160         prevCharMatched = true;
2161         newStr.append(replacement);
2162       } else {
2163         prevCharMatched = false;
2164         newStr.append(c);
2165       }
2166     }
2167 
2168     return newStr.toString();
2169   }
2170 
2171   /**
2172    * Returns a string with all sequences of ISO control chars (0x00 to 0x1F and
2173    * 0x7F to 0x9F) replaced by the supplied string.  ISO control characters are
2174    * identified via {@link Character#isISOControl(char)}.
2175    *
2176    * @param str the string you want to strip of ISO control chars
2177    * @param replacement the replacement string
2178    * @return a String with all control characters replaced by the replacement
2179    * string, or null if input is null.
2180    * @deprecated use {@link CharMatcher#JAVA_ISO_CONTROL}. If {@code
2181    *     replacement} is the empty string, use {@link
2182    *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
2183    *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
2184    *     replacement strings use
2185    *     {@code str.replaceAll("\p{Cntrl}+", replacement)}.
2186    *     In all cases you must first ensure that {@code str} is not null.
2187    */
collapseControlChars( String str, String replacement)2188   @Deprecated public static String collapseControlChars(
2189       String str, String replacement) {
2190     /*
2191      * We re-implement the StringUtil.collapse() loop here rather than call
2192      * collapse() with an input String of control chars, because matching via
2193      * isISOControl() is about 10x faster.
2194      */
2195     if (str == null) {
2196       return null;
2197     }
2198 
2199     StringBuilder newStr = new StringBuilder();
2200 
2201     boolean prevCharMatched = false;
2202     char c;
2203     for (int i = 0; i < str.length(); i++) {
2204       c = str.charAt(i);
2205       if (Character.isISOControl(c)) {
2206         // this character is matched
2207         if (prevCharMatched) {
2208           // apparently a string of matched chars, so don't append anything
2209           // to the string
2210           continue;
2211         }
2212         prevCharMatched = true;
2213         newStr.append(replacement);
2214       } else {
2215         prevCharMatched = false;
2216         newStr.append(c);
2217       }
2218     }
2219 
2220     return newStr.toString();
2221   }
2222 
2223   /**
2224    * Read a String of up to maxLength bytes from an InputStream.
2225    *
2226    * <p>Note that this method uses the default platform encoding, and expects
2227    * that encoding to be single-byte, which is not always the case. Its use
2228    * is discouraged. For reading the entire stream (maxLength == -1) you can use:
2229    * <pre>
2230    *   CharStreams.toString(new InputStreamReader(is, Charsets.ISO_8859_1))
2231    * </pre>
2232    * {@code CharStreams} is in the {@code com.google.common.io} package.
2233    *
2234    * <p>For maxLength >= 0 a literal translation would be
2235    * <pre>
2236    *   CharStreams.toString(new InputStreamReader(
2237    *       new LimitInputStream(is, maxLength), Charsets.ISO_8859_1))
2238    * </pre>
2239    * For multi-byte encodings that is broken because the limit could end in
2240    * the middle of the character--it would be better to limit the reader than
2241    * the underlying stream.
2242    *
2243    * @param is input stream
2244    * @param maxLength max number of bytes to read from "is". If this is -1, we
2245    *          read everything.
2246    *
2247    * @return String up to maxLength bytes, read from "is"
2248    * @deprecated see the advice above
2249    */
stream2String(InputStream is, int maxLength)2250   @Deprecated public static String stream2String(InputStream is, int maxLength)
2251       throws IOException {
2252     byte[] buffer = new byte[4096];
2253     StringWriter sw = new StringWriter();
2254     int totalRead = 0;
2255     int read = 0;
2256 
2257     do {
2258       sw.write(new String(buffer, 0, read));
2259       totalRead += read;
2260       read = is.read(buffer, 0, buffer.length);
2261     } while (((-1 == maxLength) || (totalRead < maxLength)) && (read != -1));
2262 
2263     return sw.toString();
2264   }
2265 
2266   /**
2267    * Parse a list of substrings separated by a given delimiter. The delimiter
2268    * can also appear in substrings (just double them):
2269    *
2270    * parseDelimitedString("this|is", '|') returns ["this","is"]
2271    * parseDelimitedString("this||is", '|') returns ["this|is"]
2272    *
2273    * @param list String containing delimited substrings
2274    * @param delimiter Delimiter (anything except ' ' is allowed)
2275    *
2276    * @return String[] A String array of parsed substrings
2277    */
parseDelimitedList(String list, char delimiter)2278   public static String[] parseDelimitedList(String list,
2279                                             char delimiter) {
2280     String delim = "" + delimiter;
2281     // Append a sentinel of delimiter + space
2282     // (see comments below for more info)
2283     StringTokenizer st = new StringTokenizer(list + delim + " ",
2284                                              delim,
2285                                              true);
2286     ArrayList<String> v = new ArrayList<String>();
2287     String lastToken = "";
2288     StringBuilder word = new StringBuilder();
2289 
2290     // We keep a sliding window of 2 tokens
2291     //
2292     // delimiter : delimiter -> append delimiter to current word
2293     //                          and clear most recent token
2294     //                          (so delim : delim : delim will not
2295     //                          be treated as two escaped delims.)
2296     //
2297     // tok : delimiter -> append tok to current word
2298     //
2299     // delimiter : tok -> add current word to list, and clear it.
2300     //                    (We append a sentinel that conforms to this
2301     //                    pattern to make sure we've pushed every parsed token)
2302     while (st.hasMoreTokens()) {
2303       String tok = st.nextToken();
2304       if (lastToken != null) {
2305         if (tok.equals(delim)) {
2306           word.append(lastToken);
2307           if (lastToken.equals(delim)) { tok = null; }
2308         } else {
2309           if (word.length() != 0) {
2310             v.add(word.toString());
2311           }
2312           word.setLength(0);
2313         }
2314       }
2315       lastToken = tok;
2316     }
2317 
2318     return v.toArray(new String[0]);
2319   }
2320 
2321   /**
2322    * Compares two strings, guarding against nulls.
2323    *
2324    * @param nullsAreGreater true if nulls should be greater than any string,
2325    *  false is less than.
2326    * @deprecated use {@link String#CASE_INSENSITIVE_ORDER}, together with
2327    *     {@link com.google.common.collect.Ordering#nullsFirst()} or
2328    *     {@link com.google.common.collect.Ordering#nullsLast()} if
2329    *     needed
2330    */
compareToIgnoreCase(String s1, String s2, boolean nullsAreGreater)2331   @Deprecated public static int compareToIgnoreCase(String s1, String s2,
2332       boolean nullsAreGreater) {
2333     if (s1 == s2) {
2334       return 0; // Either both the same String, or both null
2335     }
2336     if (s1 == null) {
2337       return nullsAreGreater ? 1 : -1;
2338     }
2339     if (s2 == null) {
2340       return nullsAreGreater ? -1 : 1;
2341     }
2342     return s1.compareToIgnoreCase(s2);
2343   }
2344 
2345   /**
2346    * Splits s with delimiters in delimiter and returns the last token
2347    */
lastToken(String s, String delimiter)2348   public static String lastToken(String s, String delimiter) {
2349     return s.substring(CharMatcher.anyOf(delimiter).lastIndexIn(s) + 1);
2350   }
2351 
2352   private static final Pattern characterReferencePattern =
2353       Pattern.compile("&#?[a-zA-Z0-9]{1,8};");
2354 
2355   /**
2356    * Determines if a string contains what looks like an html character
2357    * reference. Useful for deciding whether unescaping is necessary.
2358    */
containsCharRef(String s)2359   public static boolean containsCharRef(String s) {
2360     return characterReferencePattern.matcher(s).find();
2361   }
2362 
2363   /**
2364    * Determines if a string is a Hebrew word. A string is considered to be
2365    * a Hebrew word if {@link #isHebrew(int)} is true for any of its characters.
2366    */
isHebrew(String s)2367   public static boolean isHebrew(String s) {
2368     int len = s.length();
2369     for (int i = 0; i < len; ++i) {
2370       if (isHebrew(s.codePointAt(i))) {
2371         return true;
2372       }
2373     }
2374     return false;
2375   }
2376 
2377   /**
2378    * Determines if a character is a Hebrew character.
2379    */
isHebrew(int codePoint)2380   public static boolean isHebrew(int codePoint) {
2381     return Character.UnicodeBlock.HEBREW.equals(
2382                Character.UnicodeBlock.of(codePoint));
2383   }
2384 
/**
 * Determines if a string is a CJK word. A string is considered to be CJK
 * if {@link #isCjk(int)} is true for any of its code points.
 *
 * @param s the string to inspect
 * @return true if at least one code point of {@code s} is CJK
 */
public static boolean isCjk(String s) {
  final int end = s.length();
  int offset = 0;
  while (offset < end) {
    final int codePoint = s.codePointAt(offset);
    if (isCjk(codePoint)) {
      return true;
    }
    // Step over both halves of a surrogate pair at once; the trailing half
    // on its own belongs to none of the CJK blocks, so nothing is missed.
    offset += Character.charCount(codePoint);
  }
  return false;
}

/**
 * Unicode code blocks containing CJK characters.
 */
private static final Set<Character.UnicodeBlock> CJK_BLOCKS;
static {
  Set<Character.UnicodeBlock> blocks = new HashSet<Character.UnicodeBlock>();
  Collections.addAll(blocks,
      Character.UnicodeBlock.HANGUL_JAMO,
      Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT,
      Character.UnicodeBlock.KANGXI_RADICALS,
      Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION,
      Character.UnicodeBlock.HIRAGANA,
      Character.UnicodeBlock.KATAKANA,
      Character.UnicodeBlock.BOPOMOFO,
      Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO,
      Character.UnicodeBlock.KANBUN,
      Character.UnicodeBlock.BOPOMOFO_EXTENDED,
      Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS,
      Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS,
      Character.UnicodeBlock.CJK_COMPATIBILITY,
      Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
      Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS,
      Character.UnicodeBlock.HANGUL_SYLLABLES,
      Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS,
      Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS,
      Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS,
      Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
      Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT);
  CJK_BLOCKS = Collections.unmodifiableSet(blocks);
}

/**
 * Determines if a character is a CJK ideograph or a character typically
 * used only in CJK text.
 *
 * <p>Note: a {@code char} cannot represent a supplementary character. To
 * handle all Unicode characters, including supplementary ones, use
 * {@link #isCjk(int)}.
 *
 * @param ch the UTF-16 code unit to classify
 * @return true if {@code ch} lies in one of the CJK blocks
 */
public static boolean isCjk(char ch) {
  final int codePoint = ch;
  return isCjk(codePoint);
}

/**
 * Determines if a code point is a CJK ideograph or a character typically
 * used only in CJK text.
 *
 * @param codePoint the code point to classify
 * @return true if {@code codePoint} lies in one of the CJK blocks
 */
public static boolean isCjk(int codePoint) {
  // Time-saving early exit: no bits above the low byte means Latin-1,
  // which is never CJK.
  if ((codePoint >>> 8) == 0) {
    return false;
  }

  final Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
  return CJK_BLOCKS.contains(block);
}
2453 
2454   /**
2455    * Returns the approximate display width of the string, measured in units of
2456    * ascii characters.
2457    *
2458    * @see StringUtil#displayWidth(char)
2459    */
displayWidth(String s)2460   public static int displayWidth(String s) {
2461     // TODO(kevinb): could reimplement this as
2462     // return s.length() * 2 - CharMatcher.SINGLE_WIDTH.countIn(s);
2463     int width = 0;
2464     int len = s.length();
2465     for (int i = 0; i < len; ++i) {
2466       width += displayWidth(s.charAt(i));
2467     }
2468     return width;
2469   }
2470 
2471   /**
2472    * Returns the approximate display width of the character, measured
2473    * in units of ascii characters.
2474    *
2475    * This method should err on the side of caution. By default, characters
2476    * are assumed to have width 2; this covers CJK ideographs, various
2477    * symbols and miscellaneous weird scripts. Given below are some Unicode
2478    * ranges for which it seems safe to assume that no character is
2479    * substantially wider than an ascii character:
2480    *   - Latin, extended Latin, even more extended Latin.
2481    *   - Greek, extended Greek, Cyrillic.
2482    *   - Some symbols (including currency symbols) and punctuation.
2483    *   - Half-width Katakana and Hangul.
2484    *   - Hebrew
2485    *   - Arabic
2486    *   - Thai
2487    * Characters in these ranges are given a width of 1.
2488    *
2489    * IMPORTANT: this function has analogs in C++ (encodingutils.cc,
2490    * named UnicodeCharWidth) and JavaScript
2491    * (java/com/google/ads/common/frontend/adwordsbase/resources/CreateAdUtil.js),
2492    * which need to be updated if you change the implementation here.
2493    */
displayWidth(char ch)2494   public static int displayWidth(char ch) {
2495     if (ch <= '\u04f9' ||   // CYRILLIC SMALL LETTER YERU WITH DIAERESIS
2496         ch == '\u05be' ||   // HEBREW PUNCTUATION MAQAF
2497         (ch >= '\u05d0' && ch <= '\u05ea') ||  // HEBREW LETTER ALEF ... TAV
2498         ch == '\u05F3' ||   // HEBREW PUNCTUATION GERESH
2499         ch == '\u05f4' ||   // HEBREW PUNCTUATION GERSHAYIM
2500         (ch >= '\u0600' && ch <= '\u06ff') || // Block=Arabic
2501         (ch >= '\u0750' && ch <= '\u077f') || // Block=Arabic_Supplement
2502         (ch >= '\ufb50' && ch <= '\ufdff') || // Block=Arabic_Presentation_Forms-A
2503         (ch >= '\ufe70' && ch <= '\ufeff') || // Block=Arabic_Presentation_Forms-B
2504         (ch >= '\u1e00' && ch <= '\u20af') || /* LATIN CAPITAL LETTER A WITH RING BELOW
2505                                                  ... DRACHMA SIGN */
2506         (ch >= '\u2100' && ch <= '\u213a') || // ACCOUNT OF ... ROTATED CAPITAL Q
2507         (ch >= '\u0e00' && ch <= '\u0e7f') || // Thai
2508         (ch >= '\uff61' && ch <= '\uffdc')) { /* HALFWIDTH IDEOGRAPHIC FULL STOP
2509                                                  ... HALFWIDTH HANGUL LETTER I */
2510       return 1;
2511     }
2512     return 2;
2513   }
2514 
2515   /**
2516    * @return a string representation of the given native array.
2517    */
toString(float[] iArray)2518   public static String toString(float[] iArray) {
2519     if (iArray == null) {
2520       return "NULL";
2521     }
2522 
2523     StringBuilder buffer = new StringBuilder();
2524     buffer.append("[");
2525     for (int i = 0; i < iArray.length; i++) {
2526       buffer.append(iArray[i]);
2527       if (i != (iArray.length - 1)) {
2528         buffer.append(", ");
2529       }
2530     }
2531     buffer.append("]");
2532     return buffer.toString();
2533   }
2534 
2535   /**
2536    * @return a string representation of the given native array.
2537    */
toString(long[] iArray)2538   public static String toString(long[] iArray) {
2539     if (iArray == null) {
2540       return "NULL";
2541     }
2542 
2543     StringBuilder buffer = new StringBuilder();
2544     buffer.append("[");
2545     for (int i = 0; i < iArray.length; i++) {
2546       buffer.append(iArray[i]);
2547       if (i != (iArray.length - 1)) {
2548         buffer.append(", ");
2549       }
2550     }
2551     buffer.append("]");
2552     return buffer.toString();
2553   }
2554 
2555   /**
2556    * @return a string representation of the given native array
2557    */
toString(int[] iArray)2558   public static String toString(int[] iArray) {
2559     if (iArray == null) {
2560       return "NULL";
2561     }
2562 
2563     StringBuilder buffer = new StringBuilder();
2564     buffer.append("[");
2565     for (int i = 0; i < iArray.length; i++) {
2566       buffer.append(iArray[i]);
2567       if (i != (iArray.length - 1)) {
2568         buffer.append(", ");
2569       }
2570     }
2571     buffer.append("]");
2572     return buffer.toString();
2573   }
2574 
2575   /**
2576    * @return a string representation of the given array.
2577    */
toString(String[] iArray)2578   public static String toString(String[] iArray) {
2579     if (iArray == null) { return "NULL"; }
2580 
2581     StringBuilder buffer = new StringBuilder();
2582     buffer.append("[");
2583     for (int i = 0; i < iArray.length; i++) {
2584       buffer.append("'").append(iArray[i]).append("'");
2585       if (i != iArray.length - 1) {
2586         buffer.append(", ");
2587       }
2588     }
2589     buffer.append("]");
2590 
2591     return buffer.toString();
2592   }
2593 
2594   /**
2595    * Returns the string, in single quotes, or "NULL". Intended only for
2596    * logging.
2597    *
2598    * @param s the string
2599    * @return the string, in single quotes, or the string "null" if it's null.
2600    */
toString(String s)2601   public static String toString(String s) {
2602     if (s == null) {
2603       return "NULL";
2604     } else {
2605       return new StringBuilder(s.length() + 2).append("'").append(s)
2606                                               .append("'").toString();
2607     }
2608   }
2609 
2610   /**
2611    * @return a string representation of the given native array
2612    */
toString(int[][] iArray)2613   public static String toString(int[][] iArray) {
2614     if (iArray == null) {
2615       return "NULL";
2616     }
2617 
2618     StringBuilder buffer = new StringBuilder();
2619     buffer.append("[");
2620     for (int i = 0; i < iArray.length; i++) {
2621       buffer.append("[");
2622       for (int j = 0; j < iArray[i].length; j++) {
2623         buffer.append(iArray[i][j]);
2624         if (j != (iArray[i].length - 1)) {
2625           buffer.append(", ");
2626         }
2627       }
2628       buffer.append("]");
2629       if (i != iArray.length - 1) {
2630         buffer.append(" ");
2631       }
2632     }
2633     buffer.append("]");
2634     return buffer.toString();
2635   }
2636 
2637   /**
2638    * @return a string representation of the given native array.
2639    */
toString(long[][] iArray)2640   public static String toString(long[][] iArray) {
2641     if (iArray == null) { return "NULL"; }
2642 
2643     StringBuilder buffer = new StringBuilder();
2644     buffer.append("[");
2645     for (int i = 0; i < iArray.length; i++) {
2646       buffer.append("[");
2647       for (int j = 0; j < iArray[i].length; j++) {
2648         buffer.append(iArray[i][j]);
2649         if (j != (iArray[i].length - 1)) {
2650           buffer.append(", ");
2651         }
2652       }
2653       buffer.append("]");
2654       if (i != iArray.length - 1) {
2655         buffer.append(" ");
2656       }
2657     }
2658     buffer.append("]");
2659     return buffer.toString();
2660   }
2661 
2662   /**
2663    * @return a String representation of the given object array.
2664    * The strings are obtained by calling toString() on the
2665    * underlying objects.
2666    */
toString(Object[] obj)2667   public static String toString(Object[] obj) {
2668     if (obj == null) { return "NULL"; }
2669     StringBuilder tmp = new StringBuilder();
2670     tmp.append("[");
2671     for (int i = 0; i < obj.length; i++) {
2672       tmp.append(obj[i].toString());
2673       if (i != obj.length - 1) {
2674         tmp.append(",");
2675       }
2676     }
2677     tmp.append("]");
2678     return tmp.toString();
2679   }
2680 
2681   private static final char[] HEX_CHARS
2682       = { '0', '1', '2', '3', '4', '5', '6', '7',
2683           '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
2684   private static final char[] OCTAL_CHARS = HEX_CHARS;  // ignore the last 8 :)
2685 
2686   /**
2687    * Convert a byte array to a hex-encoding string: "a33bff00..."
2688    *
2689    * @deprecated Use {@link ByteArrays#toHexString}.
2690    */
bytesToHexString(final byte[] bytes)2691   @Deprecated public static String bytesToHexString(final byte[] bytes) {
2692     return ByteArrays.toHexString(bytes);
2693   }
2694 
2695   /**
2696    * Convert a byte array to a hex-encoding string with the specified
2697    * delimiter: "a3&lt;delimiter&gt;3b&lt;delimiter&gt;ff..."
2698    */
bytesToHexString(final byte[] bytes, Character delimiter)2699   public static String bytesToHexString(final byte[] bytes,
2700       Character delimiter) {
2701     StringBuilder hex =
2702       new StringBuilder(bytes.length * (delimiter == null ? 2 : 3));
2703     int nibble1, nibble2;
2704     for (int i = 0; i < bytes.length; i++) {
2705       nibble1 = (bytes[i] >>> 4) & 0xf;
2706       nibble2 = bytes[i] & 0xf;
2707       if (i > 0 && delimiter != null) { hex.append(delimiter.charValue()); }
2708       hex.append(HEX_CHARS[nibble1]);
2709       hex.append(HEX_CHARS[nibble2]);
2710     }
2711     return hex.toString();
2712   }
2713 
2714   /**
2715    * Safely convert the string to uppercase.
2716    * @return upper case representation of the String; or null if
2717    * the input string is null.
2718    */
toUpperCase(String src)2719   public static String toUpperCase(String src) {
2720     if (src == null) {
2721       return null;
2722     } else {
2723       return src.toUpperCase();
2724     }
2725   }
2726 
2727   /**
2728    * Safely convert the string to lowercase.
2729    * @return lower case representation of the String; or null if
2730    * the input string is null.
2731    */
toLowerCase(String src)2732   public static String toLowerCase(String src) {
2733     if (src == null) {
2734       return null;
2735     } else {
2736       return src.toLowerCase();
2737     }
2738   }
2739 
2740   private static final Pattern dbSpecPattern =
2741       Pattern.compile("(.*)\\{(\\d+),(\\d+)\\}(.*)");
2742 
2743   /**
2744    * @param dbSpecComponent a single component of a DBDescriptor spec
2745    * (e.g. the host or database component). The expected format of the string is:
2746    * <br>
2747    *             <center>(prefix){(digits),(digits)}(suffix)</center>
2748    * </br>
2749    * @return a shard expansion of the given String.
2750    * Note that unless the pattern is matched exactly, no expansion is
2751    * performed and the original string is returned unaltered.
2752    * For example, 'db{0,1}.adz' is expanded into 'db0.adz, db1.adz'.
2753    * Note that this method is added to StringUtil instead of
2754    * DBDescriptor to better encapsulate the choice of regexp implementation.
2755    * @throws IllegalArgumentException if the string does not parse.
2756    */
expandShardNames(String dbSpecComponent)2757   public static String expandShardNames(String dbSpecComponent)
2758       throws IllegalArgumentException, IllegalStateException {
2759 
2760     Matcher matcher = dbSpecPattern.matcher(dbSpecComponent);
2761     if (matcher.find()) {
2762       try {
2763         String prefix = dbSpecComponent.substring(
2764           matcher.start(1), matcher.end(1));
2765         int minShard =
2766           Integer.parseInt(
2767             dbSpecComponent.substring(
2768               matcher.start(2), matcher.end(2)));
2769         int maxShard =
2770           Integer.parseInt(
2771             dbSpecComponent.substring(
2772               matcher.start(3), matcher.end(3)));
2773         String suffix = dbSpecComponent.substring(
2774           matcher.start(4), matcher.end(4));
2775         //Log2.logEvent(prefix + " " + minShard + " " + maxShard + " " + suffix);
2776         if (minShard > maxShard) {
2777           throw new IllegalArgumentException(
2778             "Maximum shard must be greater than or equal to " +
2779             "the minimum shard");
2780         }
2781         StringBuilder tmp = new StringBuilder();
2782         for (int shard = minShard; shard <= maxShard; shard++) {
2783           tmp.append(prefix).append(shard).append(suffix);
2784           if (shard != maxShard) {
2785             tmp.append(",");
2786           }
2787         }
2788         return tmp.toString();
2789       } catch (NumberFormatException nfex) {
2790         throw new IllegalArgumentException(
2791           "Malformed DB specification component: " + dbSpecComponent);
2792       }
2793     } else {
2794       return dbSpecComponent;
2795     }
2796   }
2797 
2798 
2799   /**
2800   * Returns a string that is equivalent to the specified string with its
2801   * first character converted to uppercase as by {@link String#toUpperCase()}.
2802   * The returned string will have the same value as the specified string if
2803   * its first character is non-alphabetic, if its first character is already
2804   * uppercase, or if the specified string is of length 0.
2805   *
2806   * <p>For example:
2807   * <pre>
2808   *    capitalize("foo bar").equals("Foo bar");
2809   *    capitalize("2b or not 2b").equals("2b or not 2b")
2810   *    capitalize("Foo bar").equals("Foo bar");
2811   *    capitalize("").equals("");
2812   * </pre>
2813   *
2814   * @param s the string whose first character is to be uppercased
2815   * @return a string equivalent to <tt>s</tt> with its first character
2816   *     converted to uppercase
2817   * @throws NullPointerException if <tt>s</tt> is null
2818   */
capitalize(String s)2819   public static String capitalize(String s) {
2820     if (s.length() == 0) {
2821       return s;
2822     }
2823     char first = s.charAt(0);
2824     char capitalized = Character.toUpperCase(first);
2825     return (first == capitalized)
2826         ? s
2827         : capitalized + s.substring(1);
2828   }
2829 
2830   /**
2831    * Examine a string to see if it starts with a given prefix (case
2832    * insensitive). Just like String.startsWith() except doesn't
2833    * respect case. Strings are compared in the same way as in
2834    * {@link String#equalsIgnoreCase}.
2835    *
2836    * @param str the string to examine
2837    * @param prefix the prefix to look for
2838    * @return a boolean indicating if str starts with prefix (case insensitive)
2839    */
startsWithIgnoreCase(String str, String prefix)2840   public static boolean startsWithIgnoreCase(String str, String prefix) {
2841     return str.regionMatches(true, 0, prefix, 0, prefix.length());
2842   }
2843 
2844   /**
2845    * Examine a string to see if it ends with a given suffix (case
2846    * insensitive). Just like String.endsWith() except doesn't respect
2847    * case. Strings are compared in the same way as in
2848    * {@link String#equalsIgnoreCase}.
2849    *
2850    * @param str the string to examine
2851    * @param suffix the suffix to look for
2852    * @return a boolean indicating if str ends with suffix (case insensitive)
2853    */
endsWithIgnoreCase(String str, String suffix)2854   public static boolean endsWithIgnoreCase(String str, String suffix) {
2855     int len = suffix.length();
2856     return str.regionMatches(true, str.length() - len, suffix, 0, len);
2857   }
2858 
2859   /**
2860    * @param c one codePoint
2861    * @return the number of bytes needed to encode this codePoint in UTF-8
2862    */
bytesUtf8(int c)2863   private static int bytesUtf8(int c) {
2864     if (c < 0x80) {
2865       return 1;
2866     } else if (c < 0x00800) {
2867       return 2;
2868     } else if (c < 0x10000) {
2869       return 3;
2870     } else if (c < 0x200000) {
2871       return 4;
2872 
2873     // RFC 3629 forbids the use of UTF-8 for codePoint greater than 0x10FFFF,
2874     // so if the caller respects this RFC, this should not happen
2875     } else if (c < 0x4000000) {
2876       return 5;
2877     } else {
2878       return 6;
2879     }
2880   }
2881 
2882   /**
2883    * @param str a string
2884    * @return the number of bytes required to represent this string in UTF-8
2885    */
bytesStorage(String str)2886   public static int bytesStorage(String str) {
2887     // offsetByCodePoint has a bug if its argument is the result of a
2888     // call to substring. To avoid this, we create a new String
2889     // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
2890     String s = new String(str);
2891 
2892     int len = 0;
2893     for (int i = 0; i < s.length(); i = s.offsetByCodePoints(i, 1)) {
2894       len += bytesUtf8(s.codePointAt(i));
2895     }
2896     return len;
2897   }
2898 
2899   /**
2900    * @param str a string
2901    * @param maxbytes
2902    * @return the beginning of the string, so that it uses less than
2903    *     maxbytes bytes in UTF-8
2904    * @throws IndexOutOfBoundsException if maxbytes is negative
2905    */
truncateStringForUtf8Storage(String str, int maxbytes)2906   public static String truncateStringForUtf8Storage(String str, int maxbytes) {
2907     if (maxbytes < 0) {
2908       throw new IndexOutOfBoundsException();
2909     }
2910 
2911     // offsetByCodePoint has a bug if its argument is the result of a
2912     // call to substring. To avoid this, we create a new String
2913     // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
2914     // TODO(cquinn): should be fixed as of 1.5.0_01
2915     String s = new String(str);
2916 
2917     int codepoints = 0;
2918     int bytesUsed = 0;
2919     for (codepoints = 0; codepoints < s.length();
2920         codepoints = s.offsetByCodePoints(codepoints, 1)) {
2921       int glyphBytes = StringUtil.bytesUtf8(s.codePointAt(codepoints));
2922       if (bytesUsed + glyphBytes > maxbytes) {
2923         break;
2924       }
2925       bytesUsed += glyphBytes;
2926     }
2927     return s.substring(0, codepoints);
2928   }
2929 
2930   /**
2931    * If the given string is of length {@code maxLength} or less, then it is
2932    * returned as is.
2933    * If the string is longer than {@code maxLength}, the returned string is
2934    * truncated before the last space character on or before
2935    * {@code source.charAt(maxLength)}. If the string has no spaces, the
2936    * returned string is truncated to {@code maxLength}.
2937    *
2938    * @param source the string to truncate if necessary
2939    * @param maxLength
2940    * @return the original string if its length is less than or equal to
2941    *     maxLength, otherwise a truncated string as mentioned above
2942    */
truncateIfNecessary(String source, int maxLength)2943   public static String truncateIfNecessary(String source, int maxLength) {
2944     if (source.length() <= maxLength) {
2945       return source;
2946     }
2947     String str = unicodePreservingSubstring(source, 0, maxLength);
2948 
2949     @SuppressWarnings("deprecation") // we'll make this go away before that does
2950     CharMatcher whitespaceMatcher = CharMatcher.LEGACY_WHITESPACE;
2951     String truncated = whitespaceMatcher.trimTrailingFrom(str);
2952 
2953     // We may have had multiple spaces at maxLength, which were stripped away
2954     if (truncated.length() < maxLength) {
2955       return truncated;
2956     }
2957     // We have a truncated string of length maxLength. If the next char was a
2958     // space, we truncated at a word boundary, so we can return immediately
2959     if (Character.isSpaceChar(source.charAt(maxLength))) {
2960       return truncated;
2961     }
2962     // We truncated in the middle of the word. Try to truncate before
2963     // the last space, if it exists. Otherwise, return the truncated string
2964     for (int i = truncated.length() - 1; i >= 0; --i) {
2965       if (Character.isSpaceChar(truncated.charAt(i))) {
2966         String substr = truncated.substring(0, i);
2967         return whitespaceMatcher.trimTrailingFrom(substr);
2968       }
2969     }
2970     return truncated;
2971   }
2972 
2973   /**
2974    * If this given string is of length {@code maxLength} or less, it will
2975    * be returned as-is.
2976    * Otherwise it will be trucated to {@code maxLength}, regardless of whether
2977    * there are any space characters in the String. If an ellipsis is requested
2978    * to be appended to the truncated String, the String will be truncated so
2979    * that the ellipsis will also fit within maxLength.
2980    * If no truncation was necessary, no ellipsis will be added.
2981    *
2982    * @param source the String to truncate if necessary
2983    * @param maxLength the maximum number of characters to keep
2984    * @param addEllipsis if true, and if the String had to be truncated,
2985    *     add "..." to the end of the String before returning. Additionally,
2986    *     the ellipsis will only be added if maxLength is greater than 3.
2987    * @return the original string if its length is less than or equal to
2988    *     maxLength, otherwise a truncated string as mentioned above
2989    */
truncateAtMaxLength(String source, int maxLength, boolean addEllipsis)2990   public static String truncateAtMaxLength(String source, int maxLength,
2991       boolean addEllipsis) {
2992 
2993     if (source.length() <= maxLength) {
2994       return source;
2995     }
2996     if (addEllipsis && maxLength > 3) {
2997       return unicodePreservingSubstring(source, 0, maxLength - 3) + "...";
2998     }
2999     return unicodePreservingSubstring(source, 0, maxLength);
3000   }
3001 
3002   /**
3003    * Normalizes {@code index} such that it respects Unicode character
3004    * boundaries in {@code str}.
3005    *
3006    * <p>If {@code index} is the low surrogate of a unicode character,
3007    * the method returns {@code index - 1}. Otherwise, {@code index} is
3008    * returned.
3009    *
3010    * <p>In the case in which {@code index} falls in an invalid surrogate pair
3011    * (e.g. consecutive low surrogates, consecutive high surrogates), or if
3012    * if it is not a valid index into {@code str}, the original value of
3013    * {@code index} is returned.
3014    *
3015    * @param str the String
3016    * @param index the index to be normalized
3017    * @return a normalized index that does not split a Unicode character
3018    */
unicodePreservingIndex(String str, int index)3019   public static int unicodePreservingIndex(String str, int index) {
3020     if (index > 0 && index < str.length()) {
3021       if (Character.isHighSurrogate(str.charAt(index - 1)) &&
3022           Character.isLowSurrogate(str.charAt(index))) {
3023         return index - 1;
3024       }
3025     }
3026     return index;
3027   }
3028 
3029   /**
3030    * Returns a substring of {@code str} that respects Unicode character
3031    * boundaries.
3032    *
3033    * <p>The string will never be split between a [high, low] surrogate pair,
3034    * as defined by {@link Character#isHighSurrogate} and
3035    * {@link Character#isLowSurrogate}.
3036    *
3037    * <p>If {@code begin} or {@code end} are the low surrogate of a unicode
3038    * character, it will be offset by -1.
3039    *
3040    * <p>This behavior guarantees that
3041    * {@code str.equals(StringUtil.unicodePreservingSubstring(str, 0, n) +
3042    *     StringUtil.unicodePreservingSubstring(str, n, str.length())) } is
3043    * true for all {@code n}.
3044    * </pre>
3045    *
3046    * <p>This means that unlike {@link String#substring(int, int)}, the length of
3047    * the returned substring may not necessarily be equivalent to
3048    * {@code end - begin}.
3049    *
3050    * @param str the original String
3051    * @param begin the beginning index, inclusive
3052    * @param end the ending index, exclusive
3053    * @return the specified substring, possibly adjusted in order to not
3054    *   split unicode surrogate pairs
3055    * @throws IndexOutOfBoundsException if the {@code begin} is negative,
3056    *   or {@code end} is larger than the length of {@code str}, or
3057    *   {@code begin} is larger than {@code end}
3058    */
unicodePreservingSubstring( String str, int begin, int end)3059   public static String unicodePreservingSubstring(
3060       String str, int begin, int end) {
3061     return str.substring(unicodePreservingIndex(str, begin),
3062         unicodePreservingIndex(str, end));
3063   }
3064 
3065   /**
3066    * Equivalent to:
3067    *
3068    * <pre>
3069    * {@link #unicodePreservingSubstring(String, int, int)}(
3070    *     str, begin, str.length())
3071    * </pre>
3072    */
unicodePreservingSubstring(String str, int begin)3073   public static String unicodePreservingSubstring(String str, int begin) {
3074     return unicodePreservingSubstring(str, begin, str.length());
3075   }
3076 
3077   /**
3078    * True iff the given character needs to be escaped in a javascript string
3079    * literal.
3080    * <p>
3081    * We need to escape the following characters in javascript string literals.
3082    * <dl>
3083    * <dt> \           <dd> the escape character
3084    * <dt> ', "        <dd> string delimiters.
3085    *                       TODO(msamuel): what about backticks (`) which are
3086    *                       non-standard but recognized as attribute delimiters.
3087    * <dt> &, <, >, =  <dd> so that a string literal can be embedded in XHTML
3088    *                       without further escaping.
3089    * </dl>
3090    * TODO(msamuel): If we're being paranoid, should we escape + to avoid UTF-7
3091    * attacks?
3092    * <p>
3093    * Unicode format control characters (category Cf) must be escaped since they
3094    * are removed by javascript parser in a pre-lex pass.
3095    * <br>According to EcmaScript 262 Section 7.1:
3096    * <blockquote>
3097    *     The format control characters can occur anywhere in the source text of
3098    *     an ECMAScript program. These characters are removed from the source
3099    *     text before applying the lexical grammar.
3100    * </blockquote>
3101    * <p>
3102    * Additionally, line terminators are not allowed to appear inside strings
3103    * and Section 7.3 says
3104    * <blockquote>
3105    *     The following characters are considered to be line terminators:<pre>
3106    *         Code Point Value   Name                  Formal Name
3107    *         \u000A             Line Feed             [LF]
3108    *         \u000D             Carriage Return       [CR]
3109    *         \u2028             Line separator        [LS]
3110    *         \u2029             Paragraph separator   [PS]
3111    * </pre></blockquote>
3112    *
   * @param codepoint the Unicode code point to test; an {@code int} rather
   *    than a {@code char} so that supplementary code points (above U+FFFF)
   *    can be passed
   * @return true if {@code codepoint} must be escaped
3115    */
mustEscapeCharInJsString(int codepoint)3116   static boolean mustEscapeCharInJsString(int codepoint) {
3117     return JS_ESCAPE_CHARS.contains(codepoint);
3118   }
3119 
3120   /**
3121    * True iff the given character needs to be escaped in a JSON string literal.
3122    * <p>
3123    * We need to escape the following characters in JSON string literals.
3124    * <dl>
3125    * <dt> \           <dd> the escape character
3126    * <dt> "           <dd> string delimiter
3127    * <dt> 0x00 - 0x1F <dd> control characters
3128    * </dl>
3129    * <p>
3130    * See EcmaScript 262 Section 15.12.1 for the full JSON grammar.
3131    */
mustEscapeCharInJsonString(int codepoint)3132   static boolean mustEscapeCharInJsonString(int codepoint) {
3133     return JSON_ESCAPE_CHARS.contains(codepoint);
3134   }
3135 
3136   /**
3137    * Builds a small set of code points.
3138    * {@code com.google.common.base} cannot depend on ICU4J, thus avoiding ICU's
3139    * {@code UnicodeSet}.
3140    * For all other purposes, please use {@code com.ibm.icu.text.UnicodeSet}.
3141    */
3142   private static class UnicodeSetBuilder {
3143     Set<Integer> codePointSet = new HashSet<Integer>();
3144 
addCodePoint(int c)3145     UnicodeSetBuilder addCodePoint(int c) {
3146       codePointSet.add(c);
3147       return this;
3148     }
3149 
addRange(int from, int to)3150     UnicodeSetBuilder addRange(int from, int to) {
3151       for (int i = from; i <= to; i++) {
3152         codePointSet.add(i);
3153       }
3154       return this;
3155     }
3156 
create()3157     Set<Integer> create() {
3158       return codePointSet;
3159     }
3160   }
3161 
3162   private static final Set<Integer> JS_ESCAPE_CHARS = new UnicodeSetBuilder()
3163       // All characters in the class of format characters, [:Cf:].
3164       // Source: http://unicode.org/cldr/utility/list-unicodeset.jsp.
3165       .addCodePoint(0xAD)
3166       .addRange(0x600, 0x603)
3167       .addCodePoint(0x6DD)
3168       .addCodePoint(0x070F)
3169       .addRange(0x17B4, 0x17B5)
3170       .addRange(0x200B, 0x200F)
3171       .addRange(0x202A, 0x202E)
3172       .addRange(0x2060, 0x2064)
3173       .addRange(0x206A, 0x206F)
3174       .addCodePoint(0xFEFF)
3175       .addRange(0xFFF9, 0xFFFB)
3176       .addRange(0x0001D173, 0x0001D17A)
3177       .addCodePoint(0x000E0001)
3178       .addRange(0x000E0020, 0x000E007F)
3179       // Plus characters mentioned in the docs of mustEscapeCharInJsString().
3180       .addCodePoint(0x0000)
3181       .addCodePoint(0x000A)
3182       .addCodePoint(0x000D)
3183       .addRange(0x2028, 0x2029)
3184       .addCodePoint(0x0085)
3185       .addCodePoint(Character.codePointAt("'", 0))
3186       .addCodePoint(Character.codePointAt("\"", 0))
3187       .addCodePoint(Character.codePointAt("&", 0))
3188       .addCodePoint(Character.codePointAt("<", 0))
3189       .addCodePoint(Character.codePointAt(">", 0))
3190       .addCodePoint(Character.codePointAt("=", 0))
3191       .addCodePoint(Character.codePointAt("\\", 0))
3192       .create();
3193 
3194   private static final Set<Integer> JSON_ESCAPE_CHARS = new UnicodeSetBuilder()
3195       .addCodePoint(Character.codePointAt("\"", 0))
3196       .addCodePoint(Character.codePointAt("\\", 0))
3197       .addRange(0x0000, 0x001F)
3198       .create();
3199 
3200   /**
3201    * <b>To be deprecated:</b> use {@link CharEscapers#xmlEscaper()} instead.
3202    */
xmlEscape(String s)3203   public static String xmlEscape(String s) {
3204     return CharEscapers.xmlEscaper().escape(s);
3205   }
3206 
3207   /**
3208    * <b>To be deprecated:</b> use {@link CharEscapers#asciiHtmlEscaper()} instead.
3209    */
htmlEscape(String s)3210   public static String htmlEscape(String s) {
3211     return CharEscapers.asciiHtmlEscaper().escape(s);
3212   }
3213 }