• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2000 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.android.mail.lib.base;
17 
18 import static com.android.mail.lib.base.Preconditions.checkArgument;
19 
20 import com.google.common.base.Joiner;
21 import com.google.common.base.Joiner.MapJoiner;
22 
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.StringWriter;
26 import java.util.ArrayList;
27 import java.util.Collection;
28 import java.util.Collections;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.LinkedHashMap;
33 import java.util.LinkedList;
34 import java.util.List;
35 import java.util.Map;
36 import java.util.Set;
37 import java.util.StringTokenizer;
38 import java.util.regex.Matcher;
39 import java.util.regex.Pattern;
40 
41 /**
42  * Static utility methods and constants pertaining to {@code String} or {@code
43  * CharSequence} instances.
44  */
45 public final class StringUtil {
StringUtil()46   private StringUtil() {} // COV_NF_LINE
47 
48   /**
49    * A completely arbitrary selection of eight whitespace characters. See
50    * <a href="http://go/white+space">this spreadsheet</a> for more details
51    * about whitespace characters.
52    *
53    * @deprecated Rewrite your code to use {@link CharMatcher#WHITESPACE}, or
54    *     consider the precise set of characters you want to match and construct
55    *     the right explicit {@link CharMatcher} or {@link String} for your own
56    *     purposes.
57    */
58   @Deprecated
59   public static final String WHITE_SPACES = " \r\n\t\u3000\u00A0\u2007\u202F";
60 
61   /** A string containing the carriage return and linefeed characters. */
62   public static final String LINE_BREAKS = "\r\n";
63 
64   /**
65    * Old location of {@link Strings#isNullOrEmpty}; this method will be
66    * deprecated soon.
67    */
isEmpty(String string)68   public static boolean isEmpty(String string) {
69     return Strings.isNullOrEmpty(string);
70   }
71 
72   /**
73    * Returns {@code true} if the given string is null, empty, or comprises only
74    * whitespace characters, as defined by {@link CharMatcher#WHITESPACE}.
75    *
76    * <p><b>Warning:</b> there are many competing definitions of "whitespace";
77    * please see <a href="http://go/white+space">this spreadsheet</a> for
78    * details.
79    *
80    * @param string the string reference to check
81    * @return {@code true} if {@code string} is null, empty, or consists of
82    *     whitespace characters only
83    */
isEmptyOrWhitespace(String string)84   public static boolean isEmptyOrWhitespace(String string) {
85     return string == null || CharMatcher.WHITESPACE.matchesAllOf(string);
86   }
87 
88   /**
89    * Old location of {@link Strings#nullToEmpty}; this method will be
90    * deprecated soon.
91    */
makeSafe(String string)92   public static String makeSafe(String string) {
93     return Strings.nullToEmpty(string);
94   }
95 
96   /**
97    * Old location of {@link Strings#emptyToNull}; this method will be
98    * deprecated soon.
99    */
toNullIfEmpty(String string)100   public static String toNullIfEmpty(String string) {
101     return Strings.emptyToNull(string);
102   }
103 
104   /**
105    * Returns the given string if it is nonempty and contains at least one
106    * non-whitespace character; {@code null} otherwise. See comment in {@link
107    * #isEmptyOrWhitespace} on the definition of whitespace.
108    *
109    * @param string the string to test and possibly return
110    * @return {@code null} if {@code string} is null, empty, or contains only
111    *     whitespace characters; {@code string} itself otherwise
112    */
toNullIfEmptyOrWhitespace( String string)113   public static String toNullIfEmptyOrWhitespace(
114       String string) {
115     return isEmptyOrWhitespace(string) ? null : string;
116   }
117 
118   /**
119    * Old location of {@link Strings#repeat}; this method will be deprecated
120    * soon.
121    */
repeat(String string, int count)122   public static String repeat(String string, int count) {
123     return Strings.repeat(string, count);
124   }
125 
126   /**
127    * Return the first index in the string of any of the specified characters,
128    * starting at a given index, or {@code -1} if none of the characters is
129    * present.
130    *
131    * @param string the non-null character sequence to look in
132    * @param chars a non-null character sequence containing the set of characters
133    *     to look for. If empty, this method will find no matches and return
134    *     {@code -1}
135    * @param fromIndex the index of the first character to examine in the input
136    *     string. If negative, the entire string will be searched. If greater
137    *     than or equal to the string length, no characters will be searched and
138    *     {@code -1} will be returned.
139    * @return the index of the first match, or {@code -1} if no match was found.
140    *     Guaranteed to be either {@code -1} or a number greater than or equal to
141    *     {@code fromIndex}
142    * @throws NullPointerException if any argument is null
143    */
144   // author: pault
indexOfChars( CharSequence string, CharSequence chars, int fromIndex)145   public static int indexOfChars(
146       CharSequence string, CharSequence chars, int fromIndex) {
147     if (fromIndex >= string.length()) {
148       return -1;
149     }
150 
151     /*
152      * Prepare lookup structures for the characters. TODO(pault): This loop
153      * could be factored into another method to allow caching of the resulting
154      * struct if a use-case of very large character sets exists.
155      */
156     Set<Character> charSet = Collections.emptySet();
157     boolean[] charArray = new boolean[128];
158     for (int i = 0; i < chars.length(); i++) {
159       char c = chars.charAt(i);
160       if (c < 128) {
161         charArray[c] = true;
162       } else {
163         if (charSet.isEmpty()) {
164           charSet = new HashSet<Character>();
165         }
166         charSet.add(c);
167       }
168     }
169 
170     // Scan the string for matches
171     for (int i = Math.max(fromIndex, 0); i < string.length(); i++) {
172       char c = string.charAt(i);
173       if (c < 128) {
174         if (charArray[c]) {
175           return i;
176         }
177       } else if (charSet.contains(c)) {
178         return i;
179       }
180     }
181     return -1;
182   }
183 
184 /*
185  * -------------------------------------------------------------------
186  * This marks the end of the code that has been written or rewritten
187  * in 2008 to the quality standards of the Java core libraries group.
188  * Code below this point is still awaiting cleanup (you can help!).
189  * See http://wiki/Nonconf/JavaCoreLibrariesStandards.
190  * -------------------------------------------------------------------
191  */
192 
193 
194   /**
195    * @param str the string to split.  Must not be null.
196    * @param delims the delimiter characters. Each character in the
197    *        string is individually treated as a delimiter.
198    * @return an array of tokens. Will not return null. Individual tokens
199    *        do not have leading/trailing whitespace removed.
200    * @deprecated see the detailed instructions under
201    *     {@link #split(String, String, boolean)}
202    */
203   @Deprecated
split(String str, String delims)204   public static String[] split(String str, String delims) {
205     return split(str, delims, false);
206   }
207 
208   /**
209    * This method is deprecated because it is too inflexible, providing
210    * only a very specific set of behaviors that almost never matches exactly
211    * what you intend. Prefer using a {@link Splitter}, which is more flexible
212    * and consistent in the way it handles trimming and empty tokens.
213    *
214    * <ul>
215    * <li>Create a {@link Splitter} using {@link Splitter#on(CharMatcher)} such
216    *     as {@code Splitter.on(CharMatcher.anyOf(delims))}.
217    * <li><i>If</i> you need whitespace trimmed from the ends of each segment,
218    *     adding {@code .trimResults()} to your splitter definition should work
219    *     in most cases. To match the exact behavior of this method, use
220    *     {@code .trimResults(CharMatcher.inRange('\0', ' '))}.
221    * <li>This method silently ignores empty tokens in the input, but allows
222    *     empty tokens to appear in the output if {@code trimTokens} is
223    *     {@code true}. Adding {@code .omitEmptyStrings()} to your splitter
224    *     definition will filter empty tokens out but will do so <i>after</i>
225    *     having performed trimming. If you absolutely require this method's
226    *     behavior in this respect, Splitter is not able to match it.
227    * <li>If you need the result as an array, use {@link
228    *     com.google.common.collect.Iterables#toArray(Iterable, Class)} on the
229    *     {@code Iterable<String>} returned by {@link Splitter#split}.
230    * </ul>
231    *
232    * @param str the string to split.  Must not be null.
233    * @param delims the delimiter characters. Each character in the string
234    *        is individually treated as a delimiter.
235    * @param trimTokens if true, leading/trailing whitespace is removed
236    *        from the tokens
237    * @return an array of tokens. Will not return null.
238    * @deprecated
239    */
240   @Deprecated
split( String str, String delims, boolean trimTokens)241   public static String[] split(
242       String str, String delims, boolean trimTokens) {
243     StringTokenizer tokenizer = new StringTokenizer(str, delims);
244     int n = tokenizer.countTokens();
245     String[] list = new String[n];
246     for (int i = 0; i < n; i++) {
247       if (trimTokens) {
248         list[i] = tokenizer.nextToken().trim();
249       } else {
250         list[i] = tokenizer.nextToken();
251       }
252     }
253     return list;
254   }
255 
256   /**
257    * Trim characters from only the beginning of a string.
258    * This is a convenience method, it simply calls trimStart(s, null).
259    *
260    * @param s String to be trimmed
261    * @return String with whitespace characters removed from the beginning
262    */
trimStart(String s)263   public static String trimStart(String s) {
264     return trimStart(s, null);
265   }
266 
267   /**
268    * Trim characters from only the beginning of a string.
269    * This method will remove all whitespace characters
270    * (defined by Character.isWhitespace(char), in addition to the characters
271    * provided, from the end of the provided string.
272    *
273    * @param s String to be trimmed
274    * @param extraChars Characters in addition to whitespace characters that
275    *                   should be trimmed.  May be null.
276    * @return String with whitespace and characters in extraChars removed
277    *                   from the beginning
278    */
trimStart(String s, String extraChars)279   public static String trimStart(String s, String extraChars) {
280     int trimCount = 0;
281     while (trimCount < s.length()) {
282       char ch = s.charAt(trimCount);
283       if (Character.isWhitespace(ch)
284         || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
285         trimCount++;
286       } else {
287         break;
288       }
289     }
290 
291     if (trimCount == 0) {
292       return s;
293     }
294     return s.substring(trimCount);
295   }
296 
297   /**
298    * Trim characters from only the end of a string.
299    * This is a convenience method, it simply calls trimEnd(s, null).
300    *
301    * @param s String to be trimmed
302    * @return String with whitespace characters removed from the end
303    */
trimEnd(String s)304   public static String trimEnd(String s) {
305     return trimEnd(s, null);
306   }
307 
308   /**
309    * Trim characters from only the end of a string.
310    * This method will remove all whitespace characters
311    * (defined by Character.isWhitespace(char), in addition to the characters
312    * provided, from the end of the provided string.
313    *
314    * @param s String to be trimmed
315    * @param extraChars Characters in addition to whitespace characters that
316    *                   should be trimmed.  May be null.
317    * @return String with whitespace and characters in extraChars removed
318    *                   from the end
319    */
trimEnd(String s, String extraChars)320   public static String trimEnd(String s, String extraChars) {
321     int trimCount = 0;
322     while (trimCount < s.length()) {
323       char ch = s.charAt(s.length() - trimCount - 1);
324       if (Character.isWhitespace(ch)
325         || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
326         trimCount++;
327       } else {
328         break;
329       }
330     }
331 
332     if (trimCount == 0) {
333       return s;
334     }
335     return s.substring(0, s.length() - trimCount);
336   }
337 
338   /**
339    * @param str the string to split.  Must not be null.
340    * @param delims the delimiter characters. Each character in the
341    *        string is individually treated as a delimiter.
342    * @return an array of tokens. Will not return null. Leading/trailing
343    *        whitespace is removed from the tokens.
344    * @deprecated see the detailed instructions under
345    *     {@link #split(String, String, boolean)}
346    */
347   @Deprecated
splitAndTrim(String str, String delims)348   public static String[] splitAndTrim(String str, String delims) {
349     return split(str, delims, true);
350   }
351 
352   /** Parse comma-separated list of ints and return as array. */
splitInts(String str)353   public static int[] splitInts(String str) throws IllegalArgumentException {
354     StringTokenizer tokenizer = new StringTokenizer(str, ",");
355     int n = tokenizer.countTokens();
356     int[] list = new int[n];
357     for (int i = 0; i < n; i++) {
358       String token = tokenizer.nextToken();
359       list[i] = Integer.parseInt(token);
360     }
361     return list;
362   }
363 
364   /** Parse comma-separated list of longs and return as array. */
splitLongs(String str)365   public static long[] splitLongs(String str) throws IllegalArgumentException {
366     StringTokenizer tokenizer = new StringTokenizer(str, ",");
367     int n = tokenizer.countTokens();
368     long[] list = new long[n];
369     for (int i = 0; i < n; i++) {
370       String token = tokenizer.nextToken();
371       list[i] = Long.parseLong(token);
372     }
373     return list;
374   }
375 
376   /** This replaces the occurrences of 'what' in 'str' with 'with'
377    *
378    * @param str the string to process
379    * @param what to replace
380    * @param with replace with this
381    * @return String str where 'what' was replaced with 'with'
382    *
383    * @deprecated Please use {@link String#replace(CharSequence, CharSequence)}.
384    */
385   @Deprecated
replace( String str, CharSequence what, CharSequence with)386   public static String replace(
387       String str, CharSequence what, CharSequence with) {
388     // Have to check this argument, for compatibility with the old impl.
389     // For the record, String.replace() is capable of handling an empty target
390     // string... but it does something kind of weird in that case.
391     checkArgument(what.length() > 0);
392     return str.replace(what, with);
393   }
394 
395   private static final Splitter NEWLINE_SPLITTER =
396       Splitter.on('\n').omitEmptyStrings();
397 
398   /**
399    * Reformats the given string to a fixed width by inserting carriage returns
400    * and trimming unnecessary whitespace. See
401    * {@link #fixedWidth(String[], int)} for details. The {@code str} argument
402    * to this method will be split on newline characters ({@code '\n'}) only
403    * (regardless of platform).  An array of resulting non-empty strings is
404    * then passed to {@link #fixedWidth(String[], int)} as the {@code lines}
405    * parameter.
406    *
407    * @param str the string to format
408    * @param width the fixed width (in characters)
409    */
fixedWidth(String str, int width)410   public static String fixedWidth(String str, int width) {
411     List<String> lines = new ArrayList<String>();
412 
413     for (String line : NEWLINE_SPLITTER.split(str)) {
414       lines.add(line);
415     }
416 
417     String[] lineArray = lines.toArray(new String[0]);
418     return fixedWidth(lineArray, width);
419   }
420 
421   /**
422    * Reformats the given array of lines to a fixed width by inserting
423    * newlines and trimming unnecessary whitespace.  This uses simple
424    * whitespace-based splitting, not sophisticated internationalized
425    * line breaking.  Newlines within a line are treated like any other
426    * whitespace.  Lines which are already short enough will be passed
427    * through unmodified.
428    *
429    * <p>Only breaking whitespace characters (those which match
430    * {@link CharMatcher#BREAKING_WHITESPACE}) are treated as whitespace by
431    * this method. Non-breaking whitespace characters will be considered as
432    * ordinary characters which are connected to any other adjacent
433    * non-whitespace characters, and will therefore appear in the returned
434    * string in their original context.
435    *
436    * @param lines array of lines to format
437    * @param width the fixed width (in characters)
438    */
fixedWidth(String[] lines, int width)439   public static String fixedWidth(String[] lines, int width) {
440     List<String> formattedLines = new ArrayList<String>();
441 
442     for (String line : lines) {
443       formattedLines.add(formatLineToFixedWidth(line, width));
444     }
445 
446     return Joiner.on('\n').join(formattedLines);
447   }
448 
449   private static final Splitter TO_WORDS =
450       Splitter.on(CharMatcher.BREAKING_WHITESPACE).omitEmptyStrings();
451 
452   /**
453    * Helper method for {@link #fixedWidth(String[], int)}
454    */
formatLineToFixedWidth(String line, int width)455   private static String formatLineToFixedWidth(String line, int width) {
456     if (line.length() <= width) {
457       return line;
458     }
459 
460     StringBuilder builder = new StringBuilder();
461     int col = 0;
462 
463     for (String word : TO_WORDS.split(line)) {
464       if (col == 0) {
465         col = word.length();
466       } else {
467         int newCol = col + word.length() + 1;  // +1 for the space
468 
469         if (newCol <= width) {
470           builder.append(' ');
471           col = newCol;
472         } else {
473           builder.append('\n');
474           col = word.length();
475         }
476       }
477 
478       builder.append(word);
479     }
480 
481     return builder.toString();
482   }
483 
484   /**
485    * Splits the argument original into a list of substrings.  All the
486    * substrings in the returned list (except possibly the last) will
487    * have length lineLen.
488    *
489    * @param lineLen  the length of the substrings to put in the list
490    * @param original the original string
491    *
492    * @return a list of strings of length lineLen that together make up the
493    *     original string
494    * @deprecated use {@code Splitter.fixedLength(lineLen).split(original))}
495    *     (note that it returns an {@code Iterable}, not a {@code List})
496    */
497   @Deprecated
fixedSplit(String original, int lineLen)498   public static List<String> fixedSplit(String original, int lineLen) {
499     List<String> output = new ArrayList<String>();
500     for (String elem : Splitter.fixedLength(lineLen).split(original)) {
501       output.add(elem);
502     }
503     return output;
504   }
505 
506   /**
507    * Indents the given String per line.
508    * @param iString the string to indent
509    * @param iIndentDepth the depth of the indentation
510    * @return the indented string
511    */
indent(String iString, int iIndentDepth)512   public static String indent(String iString, int iIndentDepth) {
513     StringBuilder spacer = new StringBuilder();
514     spacer.append("\n");
515     for (int i = 0; i < iIndentDepth; i++) {
516       spacer.append("  ");
517     }
518     return iString.replace("\n", spacer.toString());
519   }
520 
521   /**
522    * This is a both way strip.
523    *
524    * @param str the string to strip
525    * @param left strip from left
526    * @param right strip from right
527    * @param what character(s) to strip
528    * @return the stripped string
529    * @deprecated ensure the string is not null and use
530    *  <ul>
531    *    <li> {@code CharMatcher.anyOf(what).trimFrom(str)}
532    *        if {@code left == true} and {@code right == true}
533    *    <li> {@code CharMatcher.anyOf(what).trimLeadingFrom(str)}
534    *        if {@code left == true} and {@code right == false}
535    *    <li> {@code CharMatcher.anyOf(what).trimTrailingFrom(str)}
536    *        if {@code left == false} and {@code right == true}
537    *  </ul>
538    */
539   @Deprecated
megastrip(String str, boolean left, boolean right, String what)540   public static String megastrip(String str,
541                                  boolean left, boolean right,
542                                  String what) {
543     if (str == null) {
544       return null;
545     }
546 
547     CharMatcher matcher = CharMatcher.anyOf(what);
548     if (left) {
549       if (right) {
550         return matcher.trimFrom(str);
551       }
552       return matcher.trimLeadingFrom(str);
553     }
554     if (right) {
555       return matcher.trimTrailingFrom(str);
556     }
557     return str;
558   }
559 
560   /** strip - strips both ways
561    *
562    * @param str what to strip
563    * @return String the striped string
564    * @deprecated ensure the string is not null and use {@code
565    *     CharMatcher.LEGACY_WHITESPACE.trimFrom(str)}; also consider whether you
566    *     really want the legacy whitespace definition, or something more
567    *     standard like {@link CharMatcher#WHITESPACE}.
568    */
569   @SuppressWarnings("deprecation") // this is deprecated itself
strip(String str)570   @Deprecated public static String strip(String str) {
571     return (str == null) ? null : CharMatcher.LEGACY_WHITESPACE.trimFrom(str);
572   }
573 
574   /** Strip white spaces from both end, and collapse white spaces
575    * in the middle.
576    *
577    * @param str what to strip
578    * @return String the striped and collapsed string
579    * @deprecated ensure the string is not null and use {@code
580    *     CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ')}; also
581    *     consider whether you really want the legacy whitespace definition, or
582    *     something more standard like {@link CharMatcher#WHITESPACE}.
583    */
584   @SuppressWarnings("deprecation") // this is deprecated itself
stripAndCollapse(String str)585   @Deprecated public static String stripAndCollapse(String str) {
586     return (str == null) ? null
587         : CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ');
588   }
589 
590   /**
591    * Give me a string and a potential prefix, and I return the string
592    * following the prefix if the prefix matches, else null.
593    * Analogous to the c++ functions strprefix and var_strprefix.
594    *
595    * @param str the string to strip
596    * @param prefix the expected prefix
597    * @return the stripped string or <code>null</code> if the string
598    * does not start with the prefix
599    */
stripPrefix(String str, String prefix)600   public static String stripPrefix(String str, String prefix) {
601     return str.startsWith(prefix)
602         ? str.substring(prefix.length())
603         : null;
604   }
605 
606   /**
607    * Case insensitive version of stripPrefix. Strings are compared in
608    * the same way as in {@link String#equalsIgnoreCase}.
609    * Analogous to the c++ functions strcaseprefix and var_strcaseprefix.
610    *
611    * @param str the string to strip
612    * @param prefix the expected prefix
613    * @return the stripped string or <code>null</code> if the string
614    * does not start with the prefix
615    */
stripPrefixIgnoreCase(String str, String prefix)616   public static String stripPrefixIgnoreCase(String str, String prefix) {
617     return startsWithIgnoreCase(str, prefix)
618         ? str.substring(prefix.length())
619         : null;
620   }
621 
622   /**
623    * Give me a string and a potential suffix, and I return the string
624    * before the suffix if the suffix matches, else null.
625    * Analogous to the c++ function strsuffix.
626    *
627    * @param str the string to strip
628    * @param suffix the expected suffix
629    * @return the stripped string or <code>null</code> if the string
630    * does not end with the suffix
631    */
stripSuffix(String str, String suffix)632   public static String stripSuffix(String str, String suffix) {
633     return str.endsWith(suffix)
634         ? str.substring(0, str.length() - suffix.length())
635         : null;
636   }
637 
638   /**
639    * Case insensitive version of stripSuffix. Strings are compared in
640    * the same way as in {@link String#equalsIgnoreCase}.
641    * Analogous to the c++ function strcasesuffix.
642    *
643    * @param str the string to strip
644    * @param suffix the expected suffix
645    * @return the stripped string or <code>null</code> if the string
646    * does not end with the suffix
647    */
stripSuffixIgnoreCase( String str, String suffix)648   public static String stripSuffixIgnoreCase(
649       String str, String suffix) {
650     return endsWithIgnoreCase(str, suffix)
651         ? str.substring(0, str.length() - suffix.length())
652         : null;
653   }
654 
655   /**
656    * Strips all non-digit characters from a string.
657    *
658    * The resulting string will only contain characters for which isDigit()
659    * returns true.
660    *
661    * @param str the string to strip
662    * @return a string consisting of digits only, or an empty string
663    * @deprecated use {@code CharMatcher.JAVA_DIGIT.retainFrom(str)} (also
664    *     consider whether this is really the definition of "digit" you wish to
665    *     use)
666    */
stripNonDigits(String str)667   @Deprecated public static String stripNonDigits(String str) {
668     return CharMatcher.JAVA_DIGIT.retainFrom(str);
669   }
670 
671   /**
672    * Finds the last index in str of a character not in the characters
673    * in 'chars' (similar to ANSI string.find_last_not_of).
674    *
675    * Returns -1 if no such character can be found.
676    *
677    * <p><b>Note:</b> If {@code fromIndex} is zero, use {@link CharMatcher}
678    * instead for this: {@code CharMatcher.noneOf(chars).lastIndexIn(str)}.
679    */
680   // TODO(kevinb): after adding fromIndex versions of (last)IndexOf to
681   // CharMatcher, deprecate this
lastIndexNotOf(String str, String chars, int fromIndex)682   public static int lastIndexNotOf(String str, String chars, int fromIndex) {
683     fromIndex = Math.min(fromIndex, str.length() - 1);
684 
685     for (int pos = fromIndex; pos >= 0; pos--) {
686       if (chars.indexOf(str.charAt(pos)) < 0) {
687         return pos;
688       }
689     }
690 
691     return -1;
692   }
693 
694   /**
695    * Like String.replace() except that it accepts any number of old chars.
696    * Replaces any occurrances of 'oldchars' in 'str' with 'newchar'.
697    * Example: replaceChars("Hello, world!", "H,!", ' ') returns " ello  world "
698    *
699    * @deprecated use {@code CharMatcher#replaceFrom(String, char)}, for example
700    *     {@code CharMatcher.anyOf(oldchars).replaceFrom(str, newchar)}
701    */
replaceChars( String str, CharSequence oldchars, char newchar)702   @Deprecated public static String replaceChars(
703       String str, CharSequence oldchars, char newchar) {
704     return CharMatcher.anyOf(oldchars).replaceFrom(str, newchar);
705   }
706 
707   /**
708    * Remove any occurrances of 'oldchars' in 'str'.
709    * Example: removeChars("Hello, world!", ",!") returns "Hello world"
710    *
711    * @deprecated use {@link CharMatcher#removeFrom(CharSequence)}, for example
712    *     {@code CharMatcher.anyOf(oldchars).removeFrom(str)}
713    */
removeChars( String str, CharSequence oldchars)714   @Deprecated public static String removeChars(
715       String str, CharSequence oldchars) {
716     return CharMatcher.anyOf(oldchars).removeFrom(str);
717   }
718 
719   // See http://www.microsoft.com/typography/unicode/1252.htm
720   private static final CharMatcher FANCY_SINGLE_QUOTE
721       = CharMatcher.anyOf("\u0091\u0092\u2018\u2019");
722   private static final CharMatcher FANCY_DOUBLE_QUOTE
723       = CharMatcher.anyOf("\u0093\u0094\u201c\u201d");
724 
725   /**
726    * Replaces microsoft "smart quotes" (curly " and ') with their
727    * ascii counterparts.
728    */
replaceSmartQuotes(String str)729   public static String replaceSmartQuotes(String str) {
730     String tmp = FANCY_SINGLE_QUOTE.replaceFrom(str, '\'');
731     return FANCY_DOUBLE_QUOTE.replaceFrom(tmp, '"');
732   }
733 
734   /**
735    * Convert a string of hex digits to a byte array, with the first
736    * byte in the array being the MSB. The string passed in should be
737    * just the raw digits (upper or lower case), with no leading
738    * or trailing characters (like '0x' or 'h').
739    * An odd number of characters is supported.
740    * If the string is empty, an empty array will be returned.
741    *
742    * This is significantly faster than using
743    *   new BigInteger(str, 16).toByteArray();
744    * especially with larger strings. Here are the results of some
745    * microbenchmarks done on a P4 2.8GHz 2GB RAM running
746    * linux 2.4.22-gg11 and JDK 1.5 with an optimized build:
747    *
748    * String length        hexToBytes (usec)   BigInteger
749    * -----------------------------------------------------
750    * 16                       0.570                 1.43
751    * 256                      8.21                 44.4
752    * 1024                    32.8                 526
753    * 16384                  546                121000
754    */
hexToBytes(CharSequence str)755   public static byte[] hexToBytes(CharSequence str) {
756     byte[] bytes = new byte[(str.length() + 1) / 2];
757     if (str.length() == 0) {
758       return bytes;
759     }
760     bytes[0] = 0;
761     int nibbleIdx = (str.length() % 2);
762     for (int i = 0; i < str.length(); i++) {
763       char c = str.charAt(i);
764       if (!isHex(c)) {
765         throw new IllegalArgumentException("string contains non-hex chars");
766       }
767       if ((nibbleIdx % 2) == 0) {
768         bytes[nibbleIdx >> 1] = (byte) (hexValue(c) << 4);
769       } else {
770         bytes[nibbleIdx >> 1] += (byte) hexValue(c);
771       }
772       nibbleIdx++;
773     }
774     return bytes;
775   }
776 
777   /**
778    * Converts any instances of "\r" or "\r\n" style EOLs into "\n" (Line Feed).
779    */
convertEOLToLF(String input)780   public static String convertEOLToLF(String input) {
781     StringBuilder res = new StringBuilder(input.length());
782     char[] s = input.toCharArray();
783     int from = 0;
784     final int end = s.length;
785     for (int i = 0; i < end; i++) {
786       if (s[i] == '\r') {
787         res.append(s, from, i - from);
788         res.append('\n');
789         if (i + 1 < end && s[i + 1] == '\n') {
790           i++;
791         }
792 
793         from = i + 1;
794       }
795     }
796 
797     if (from == 0) {   // no \r!
798       return input;
799     }
800 
801     res.append(s, from, end - from);
802     return res.toString();
803   }
804 
805   /**
806    * Old location of {@link Strings#padStart}; this method will be deprecated
807    * soon.
808    */
padLeft(String s, int len, char padChar)809   public static String padLeft(String s, int len, char padChar) {
810     return Strings.padStart(s, len, padChar);
811   }
812 
813   /**
814    * Old location of {@link Strings#padEnd}; this method will be deprecated
815    * soon.
816    */
padRight(String s, int len, char padChar)817   public static String padRight(String s, int len, char padChar) {
818     return Strings.padEnd(s, len, padChar);
819   }
820 
821   /**
822    * Returns a string consisting of "s", with each of the first "len" characters
823    * replaced by "maskChar" character.
824    */
maskLeft(String s, int len, char maskChar)825   public static String maskLeft(String s, int len, char maskChar) {
826     if (len <= 0) {
827       return s;
828     }
829     len = Math.min(len, s.length());
830     StringBuilder sb = new StringBuilder();
831     for (int i = 0; i < len; i++) {
832       sb.append(maskChar);
833     }
834     sb.append(s.substring(len));
835     return sb.toString();
836   }
837 
isOctal(char c)838   private static boolean isOctal(char c) {
839     return (c >= '0') && (c <= '7');
840   }
841 
isHex(char c)842   private static boolean isHex(char c) {
843     return ((c >= '0') && (c <= '9')) ||
844            ((c >= 'a') && (c <= 'f')) ||
845            ((c >= 'A') && (c <= 'F'));
846   }
847 
hexValue(char c)848   private static int hexValue(char c) {
849     if ((c >= '0') && (c <= '9')) {
850       return (c - '0');
851     } else if ((c >= 'a') && (c <= 'f')) {
852       return (c - 'a') + 10;
853     } else {
854       return (c - 'A') + 10;
855     }
856   }
857 
858   /**
859    * Unescape any C escape sequences (\n, \r, \\, \ooo, etc) and return the
860    * resulting string.
861    */
unescapeCString(String s)862   public static String unescapeCString(String s) {
863     if (s.indexOf('\\') < 0) {
864       // Fast path: nothing to unescape
865       return s;
866     }
867 
868     StringBuilder sb = new StringBuilder();
869     int len = s.length();
870     for (int i = 0; i < len;) {
871       char c = s.charAt(i++);
872       if (c == '\\' && (i < len)) {
873         c = s.charAt(i++);
874         switch (c) {
875           case 'a':  c = '\007';  break;
876           case 'b':  c = '\b';    break;
877           case 'f':  c = '\f';    break;
878           case 'n':  c = '\n';    break;
879           case 'r':  c = '\r';    break;
880           case 't':  c = '\t';    break;
881           case 'v':  c = '\013';  break;
882           case '\\': c = '\\';    break;
883           case '?':  c = '?';     break;
884           case '\'': c = '\'';    break;
885           case '"':  c = '\"';    break;
886 
887           default: {
888             if ((c == 'x') && (i < len) && isHex(s.charAt(i))) {
889               // "\xXX"
890               int v = hexValue(s.charAt(i++));
891               if ((i < len) && isHex(s.charAt(i))) {
892                 v = v * 16 + hexValue(s.charAt(i++));
893               }
894               c = (char) v;
895             } else if (isOctal(c)) {
896               // "\OOO"
897               int v = (c - '0');
898               if ((i < len) && isOctal(s.charAt(i))) {
899                 v = v * 8 + (s.charAt(i++) - '0');
900               }
901               if ((i < len) && isOctal(s.charAt(i))) {
902                 v = v * 8 + (s.charAt(i++) - '0');
903               }
904               c = (char) v;
905             } else {
906               // Propagate unknown escape sequences.
907               sb.append('\\');
908             }
909             break;
910           }
911         }
912       }
913       sb.append(c);
914     }
915     return sb.toString();
916   }
917 
918   /**
919    * Unescape any MySQL escape sequences.
920    * See MySQL language reference Chapter 6 at
921    * <a href="http://www.mysql.com/doc/">http://www.mysql.com/doc/</a>.
922    * This function will <strong>not</strong> work for other SQL-like
923    * dialects.
924    * @param s string to unescape, with the surrounding quotes.
925    * @return unescaped string, without the surrounding quotes.
926    * @exception IllegalArgumentException if s is not a valid MySQL string.
927    */
unescapeMySQLString(String s)928   public static String unescapeMySQLString(String s)
929       throws IllegalArgumentException {
930     // note: the same buffer is used for both reading and writing
931     // it works because the writer can never outrun the reader
932     char chars[] = s.toCharArray();
933 
934     // the string must be quoted 'like this' or "like this"
935     if (chars.length < 2 || chars[0] != chars[chars.length - 1] ||
936         (chars[0] != '\'' && chars[0] != '"')) {
937       throw new IllegalArgumentException("not a valid MySQL string: " + s);
938     }
939 
940     // parse the string and decode the backslash sequences; in addition,
941     // quotes can be escaped 'like this: ''', "like this: """, or 'like this: "'
942     int j = 1;  // write position in the string (never exceeds read position)
943     int f = 0;  // state: 0 (normal), 1 (backslash), 2 (quote)
944     for (int i = 1; i < chars.length - 1; i++) {
945       if (f == 0) {             // previous character was normal
946         if (chars[i] == '\\') {
947           f = 1;  // backslash
948         } else if (chars[i] == chars[0]) {
949           f = 2;  // quoting character
950         } else {
951           chars[j++] = chars[i];
952         }
953       } else if (f == 1) {      // previous character was a backslash
954         switch (chars[i]) {
955           case '0':   chars[j++] = '\0';   break;
956           case '\'':  chars[j++] = '\'';   break;
957           case '"':   chars[j++] = '"';    break;
958           case 'b':   chars[j++] = '\b';   break;
959           case 'n':   chars[j++] = '\n';   break;
960           case 'r':   chars[j++] = '\r';   break;
961           case 't':   chars[j++] = '\t';   break;
962           case 'z':   chars[j++] = '\032'; break;
963           case '\\':  chars[j++] = '\\';   break;
964           default:
965             // if the character is not special, backslash disappears
966             chars[j++] = chars[i];
967             break;
968         }
969         f = 0;
970       } else {                  // previous character was a quote
971         // quoting characters must be doubled inside a string
972         if (chars[i] != chars[0]) {
973           throw new IllegalArgumentException("not a valid MySQL string: " + s);
974         }
975         chars[j++] = chars[0];
976         f = 0;
977       }
978     }
979     // string contents cannot end with a special character
980     if (f != 0) {
981       throw new IllegalArgumentException("not a valid MySQL string: " + s);
982     }
983 
984     // done
985     return new String(chars, 1, j - 1);
986   }
987 
988   // TODO(pbarry): move all HTML methods to common.html package
989 
990   static final Map<String, Character> ESCAPE_STRINGS;
991   static final Set<Character> HEX_LETTERS;
992 
993   static {
994     // HTML character entity references as defined in HTML 4
995     // see http://www.w3.org/TR/REC-html40/sgml/entities.html
996     ESCAPE_STRINGS = new HashMap<String, Character>(252);
997 
998     ESCAPE_STRINGS.put("&nbsp", '\u00A0');
999     ESCAPE_STRINGS.put("&iexcl", '\u00A1');
1000     ESCAPE_STRINGS.put("&cent", '\u00A2');
1001     ESCAPE_STRINGS.put("&pound", '\u00A3');
1002     ESCAPE_STRINGS.put("&curren", '\u00A4');
1003     ESCAPE_STRINGS.put("&yen", '\u00A5');
1004     ESCAPE_STRINGS.put("&brvbar", '\u00A6');
1005     ESCAPE_STRINGS.put("&sect", '\u00A7');
1006     ESCAPE_STRINGS.put("&uml", '\u00A8');
1007     ESCAPE_STRINGS.put("&copy", '\u00A9');
1008     ESCAPE_STRINGS.put("&ordf", '\u00AA');
1009     ESCAPE_STRINGS.put("&laquo", '\u00AB');
1010     ESCAPE_STRINGS.put("&not", '\u00AC');
1011     ESCAPE_STRINGS.put("&shy", '\u00AD');
1012     ESCAPE_STRINGS.put("&reg", '\u00AE');
1013     ESCAPE_STRINGS.put("&macr", '\u00AF');
1014     ESCAPE_STRINGS.put("&deg", '\u00B0');
1015     ESCAPE_STRINGS.put("&plusmn", '\u00B1');
1016     ESCAPE_STRINGS.put("&sup2", '\u00B2');
1017     ESCAPE_STRINGS.put("&sup3", '\u00B3');
1018     ESCAPE_STRINGS.put("&acute", '\u00B4');
1019     ESCAPE_STRINGS.put("&micro", '\u00B5');
1020     ESCAPE_STRINGS.put("&para", '\u00B6');
1021     ESCAPE_STRINGS.put("&middot", '\u00B7');
1022     ESCAPE_STRINGS.put("&cedil", '\u00B8');
1023     ESCAPE_STRINGS.put("&sup1", '\u00B9');
1024     ESCAPE_STRINGS.put("&ordm", '\u00BA');
1025     ESCAPE_STRINGS.put("&raquo", '\u00BB');
1026     ESCAPE_STRINGS.put("&frac14", '\u00BC');
1027     ESCAPE_STRINGS.put("&frac12", '\u00BD');
1028     ESCAPE_STRINGS.put("&frac34", '\u00BE');
1029     ESCAPE_STRINGS.put("&iquest", '\u00BF');
1030     ESCAPE_STRINGS.put("&Agrave", '\u00C0');
1031     ESCAPE_STRINGS.put("&Aacute", '\u00C1');
1032     ESCAPE_STRINGS.put("&Acirc", '\u00C2');
1033     ESCAPE_STRINGS.put("&Atilde", '\u00C3');
1034     ESCAPE_STRINGS.put("&Auml", '\u00C4');
1035     ESCAPE_STRINGS.put("&Aring", '\u00C5');
1036     ESCAPE_STRINGS.put("&AElig", '\u00C6');
1037     ESCAPE_STRINGS.put("&Ccedil", '\u00C7');
1038     ESCAPE_STRINGS.put("&Egrave", '\u00C8');
1039     ESCAPE_STRINGS.put("&Eacute", '\u00C9');
1040     ESCAPE_STRINGS.put("&Ecirc", '\u00CA');
1041     ESCAPE_STRINGS.put("&Euml", '\u00CB');
1042     ESCAPE_STRINGS.put("&Igrave", '\u00CC');
1043     ESCAPE_STRINGS.put("&Iacute", '\u00CD');
1044     ESCAPE_STRINGS.put("&Icirc", '\u00CE');
1045     ESCAPE_STRINGS.put("&Iuml", '\u00CF');
1046     ESCAPE_STRINGS.put("&ETH", '\u00D0');
1047     ESCAPE_STRINGS.put("&Ntilde", '\u00D1');
1048     ESCAPE_STRINGS.put("&Ograve", '\u00D2');
1049     ESCAPE_STRINGS.put("&Oacute", '\u00D3');
1050     ESCAPE_STRINGS.put("&Ocirc", '\u00D4');
1051     ESCAPE_STRINGS.put("&Otilde", '\u00D5');
1052     ESCAPE_STRINGS.put("&Ouml", '\u00D6');
1053     ESCAPE_STRINGS.put("&times", '\u00D7');
1054     ESCAPE_STRINGS.put("&Oslash", '\u00D8');
1055     ESCAPE_STRINGS.put("&Ugrave", '\u00D9');
1056     ESCAPE_STRINGS.put("&Uacute", '\u00DA');
1057     ESCAPE_STRINGS.put("&Ucirc", '\u00DB');
1058     ESCAPE_STRINGS.put("&Uuml", '\u00DC');
1059     ESCAPE_STRINGS.put("&Yacute", '\u00DD');
1060     ESCAPE_STRINGS.put("&THORN", '\u00DE');
1061     ESCAPE_STRINGS.put("&szlig", '\u00DF');
1062     ESCAPE_STRINGS.put("&agrave", '\u00E0');
1063     ESCAPE_STRINGS.put("&aacute", '\u00E1');
1064     ESCAPE_STRINGS.put("&acirc", '\u00E2');
1065     ESCAPE_STRINGS.put("&atilde", '\u00E3');
1066     ESCAPE_STRINGS.put("&auml", '\u00E4');
1067     ESCAPE_STRINGS.put("&aring", '\u00E5');
1068     ESCAPE_STRINGS.put("&aelig", '\u00E6');
1069     ESCAPE_STRINGS.put("&ccedil", '\u00E7');
1070     ESCAPE_STRINGS.put("&egrave", '\u00E8');
1071     ESCAPE_STRINGS.put("&eacute", '\u00E9');
1072     ESCAPE_STRINGS.put("&ecirc", '\u00EA');
1073     ESCAPE_STRINGS.put("&euml", '\u00EB');
1074     ESCAPE_STRINGS.put("&igrave", '\u00EC');
1075     ESCAPE_STRINGS.put("&iacute", '\u00ED');
1076     ESCAPE_STRINGS.put("&icirc", '\u00EE');
1077     ESCAPE_STRINGS.put("&iuml", '\u00EF');
1078     ESCAPE_STRINGS.put("&eth", '\u00F0');
1079     ESCAPE_STRINGS.put("&ntilde", '\u00F1');
1080     ESCAPE_STRINGS.put("&ograve", '\u00F2');
1081     ESCAPE_STRINGS.put("&oacute", '\u00F3');
1082     ESCAPE_STRINGS.put("&ocirc", '\u00F4');
1083     ESCAPE_STRINGS.put("&otilde", '\u00F5');
1084     ESCAPE_STRINGS.put("&ouml", '\u00F6');
1085     ESCAPE_STRINGS.put("&divide", '\u00F7');
1086     ESCAPE_STRINGS.put("&oslash", '\u00F8');
1087     ESCAPE_STRINGS.put("&ugrave", '\u00F9');
1088     ESCAPE_STRINGS.put("&uacute", '\u00FA');
1089     ESCAPE_STRINGS.put("&ucirc", '\u00FB');
1090     ESCAPE_STRINGS.put("&uuml", '\u00FC');
1091     ESCAPE_STRINGS.put("&yacute", '\u00FD');
1092     ESCAPE_STRINGS.put("&thorn", '\u00FE');
1093     ESCAPE_STRINGS.put("&yuml", '\u00FF');
1094     ESCAPE_STRINGS.put("&fnof", '\u0192');
1095     ESCAPE_STRINGS.put("&Alpha", '\u0391');
1096     ESCAPE_STRINGS.put("&Beta", '\u0392');
1097     ESCAPE_STRINGS.put("&Gamma", '\u0393');
1098     ESCAPE_STRINGS.put("&Delta", '\u0394');
1099     ESCAPE_STRINGS.put("&Epsilon", '\u0395');
1100     ESCAPE_STRINGS.put("&Zeta", '\u0396');
1101     ESCAPE_STRINGS.put("&Eta", '\u0397');
1102     ESCAPE_STRINGS.put("&Theta", '\u0398');
1103     ESCAPE_STRINGS.put("&Iota", '\u0399');
1104     ESCAPE_STRINGS.put("&Kappa", '\u039A');
1105     ESCAPE_STRINGS.put("&Lambda", '\u039B');
1106     ESCAPE_STRINGS.put("&Mu", '\u039C');
1107     ESCAPE_STRINGS.put("&Nu", '\u039D');
1108     ESCAPE_STRINGS.put("&Xi", '\u039E');
1109     ESCAPE_STRINGS.put("&Omicron", '\u039F');
1110     ESCAPE_STRINGS.put("&Pi", '\u03A0');
1111     ESCAPE_STRINGS.put("&Rho", '\u03A1');
1112     ESCAPE_STRINGS.put("&Sigma", '\u03A3');
1113     ESCAPE_STRINGS.put("&Tau", '\u03A4');
1114     ESCAPE_STRINGS.put("&Upsilon", '\u03A5');
1115     ESCAPE_STRINGS.put("&Phi", '\u03A6');
1116     ESCAPE_STRINGS.put("&Chi", '\u03A7');
1117     ESCAPE_STRINGS.put("&Psi", '\u03A8');
1118     ESCAPE_STRINGS.put("&Omega", '\u03A9');
1119     ESCAPE_STRINGS.put("&alpha", '\u03B1');
1120     ESCAPE_STRINGS.put("&beta", '\u03B2');
1121     ESCAPE_STRINGS.put("&gamma", '\u03B3');
1122     ESCAPE_STRINGS.put("&delta", '\u03B4');
1123     ESCAPE_STRINGS.put("&epsilon", '\u03B5');
1124     ESCAPE_STRINGS.put("&zeta", '\u03B6');
1125     ESCAPE_STRINGS.put("&eta", '\u03B7');
1126     ESCAPE_STRINGS.put("&theta", '\u03B8');
1127     ESCAPE_STRINGS.put("&iota", '\u03B9');
1128     ESCAPE_STRINGS.put("&kappa", '\u03BA');
1129     ESCAPE_STRINGS.put("&lambda", '\u03BB');
1130     ESCAPE_STRINGS.put("&mu", '\u03BC');
1131     ESCAPE_STRINGS.put("&nu", '\u03BD');
1132     ESCAPE_STRINGS.put("&xi", '\u03BE');
1133     ESCAPE_STRINGS.put("&omicron", '\u03BF');
1134     ESCAPE_STRINGS.put("&pi", '\u03C0');
1135     ESCAPE_STRINGS.put("&rho", '\u03C1');
1136     ESCAPE_STRINGS.put("&sigmaf", '\u03C2');
1137     ESCAPE_STRINGS.put("&sigma", '\u03C3');
1138     ESCAPE_STRINGS.put("&tau", '\u03C4');
1139     ESCAPE_STRINGS.put("&upsilon", '\u03C5');
1140     ESCAPE_STRINGS.put("&phi", '\u03C6');
1141     ESCAPE_STRINGS.put("&chi", '\u03C7');
1142     ESCAPE_STRINGS.put("&psi", '\u03C8');
1143     ESCAPE_STRINGS.put("&omega", '\u03C9');
1144     ESCAPE_STRINGS.put("&thetasym", '\u03D1');
1145     ESCAPE_STRINGS.put("&upsih", '\u03D2');
1146     ESCAPE_STRINGS.put("&piv", '\u03D6');
1147     ESCAPE_STRINGS.put("&bull", '\u2022');
1148     ESCAPE_STRINGS.put("&hellip", '\u2026');
1149     ESCAPE_STRINGS.put("&prime", '\u2032');
1150     ESCAPE_STRINGS.put("&Prime", '\u2033');
1151     ESCAPE_STRINGS.put("&oline", '\u203E');
1152     ESCAPE_STRINGS.put("&frasl", '\u2044');
1153     ESCAPE_STRINGS.put("&weierp", '\u2118');
1154     ESCAPE_STRINGS.put("&image", '\u2111');
1155     ESCAPE_STRINGS.put("&real", '\u211C');
1156     ESCAPE_STRINGS.put("&trade", '\u2122');
1157     ESCAPE_STRINGS.put("&alefsym", '\u2135');
1158     ESCAPE_STRINGS.put("&larr", '\u2190');
1159     ESCAPE_STRINGS.put("&uarr", '\u2191');
1160     ESCAPE_STRINGS.put("&rarr", '\u2192');
1161     ESCAPE_STRINGS.put("&darr", '\u2193');
1162     ESCAPE_STRINGS.put("&harr", '\u2194');
1163     ESCAPE_STRINGS.put("&crarr", '\u21B5');
1164     ESCAPE_STRINGS.put("&lArr", '\u21D0');
1165     ESCAPE_STRINGS.put("&uArr", '\u21D1');
1166     ESCAPE_STRINGS.put("&rArr", '\u21D2');
1167     ESCAPE_STRINGS.put("&dArr", '\u21D3');
1168     ESCAPE_STRINGS.put("&hArr", '\u21D4');
1169     ESCAPE_STRINGS.put("&forall", '\u2200');
1170     ESCAPE_STRINGS.put("&part", '\u2202');
1171     ESCAPE_STRINGS.put("&exist", '\u2203');
1172     ESCAPE_STRINGS.put("&empty", '\u2205');
1173     ESCAPE_STRINGS.put("&nabla", '\u2207');
1174     ESCAPE_STRINGS.put("&isin", '\u2208');
1175     ESCAPE_STRINGS.put("&notin", '\u2209');
1176     ESCAPE_STRINGS.put("&ni", '\u220B');
1177     ESCAPE_STRINGS.put("&prod", '\u220F');
1178     ESCAPE_STRINGS.put("&sum", '\u2211');
1179     ESCAPE_STRINGS.put("&minus", '\u2212');
1180     ESCAPE_STRINGS.put("&lowast", '\u2217');
1181     ESCAPE_STRINGS.put("&radic", '\u221A');
1182     ESCAPE_STRINGS.put("&prop", '\u221D');
1183     ESCAPE_STRINGS.put("&infin", '\u221E');
1184     ESCAPE_STRINGS.put("&ang", '\u2220');
1185     ESCAPE_STRINGS.put("&and", '\u2227');
1186     ESCAPE_STRINGS.put("&or", '\u2228');
1187     ESCAPE_STRINGS.put("&cap", '\u2229');
1188     ESCAPE_STRINGS.put("&cup", '\u222A');
1189     ESCAPE_STRINGS.put("&int", '\u222B');
1190     ESCAPE_STRINGS.put("&there4", '\u2234');
1191     ESCAPE_STRINGS.put("&sim", '\u223C');
1192     ESCAPE_STRINGS.put("&cong", '\u2245');
1193     ESCAPE_STRINGS.put("&asymp", '\u2248');
1194     ESCAPE_STRINGS.put("&ne", '\u2260');
1195     ESCAPE_STRINGS.put("&equiv", '\u2261');
1196     ESCAPE_STRINGS.put("&le", '\u2264');
1197     ESCAPE_STRINGS.put("&ge", '\u2265');
1198     ESCAPE_STRINGS.put("&sub", '\u2282');
1199     ESCAPE_STRINGS.put("&sup", '\u2283');
1200     ESCAPE_STRINGS.put("&nsub", '\u2284');
1201     ESCAPE_STRINGS.put("&sube", '\u2286');
1202     ESCAPE_STRINGS.put("&supe", '\u2287');
1203     ESCAPE_STRINGS.put("&oplus", '\u2295');
1204     ESCAPE_STRINGS.put("&otimes", '\u2297');
1205     ESCAPE_STRINGS.put("&perp", '\u22A5');
1206     ESCAPE_STRINGS.put("&sdot", '\u22C5');
1207     ESCAPE_STRINGS.put("&lceil", '\u2308');
1208     ESCAPE_STRINGS.put("&rceil", '\u2309');
1209     ESCAPE_STRINGS.put("&lfloor", '\u230A');
1210     ESCAPE_STRINGS.put("&rfloor", '\u230B');
1211     ESCAPE_STRINGS.put("&lang", '\u2329');
1212     ESCAPE_STRINGS.put("&rang", '\u232A');
1213     ESCAPE_STRINGS.put("&loz", '\u25CA');
1214     ESCAPE_STRINGS.put("&spades", '\u2660');
1215     ESCAPE_STRINGS.put("&clubs", '\u2663');
1216     ESCAPE_STRINGS.put("&hearts", '\u2665');
1217     ESCAPE_STRINGS.put("&diams", '\u2666');
1218     ESCAPE_STRINGS.put("&quot", '\u0022');
1219     ESCAPE_STRINGS.put("&amp", '\u0026');
1220     ESCAPE_STRINGS.put("&lt", '\u003C');
1221     ESCAPE_STRINGS.put("&gt", '\u003E');
1222     ESCAPE_STRINGS.put("&OElig", '\u0152');
1223     ESCAPE_STRINGS.put("&oelig", '\u0153');
1224     ESCAPE_STRINGS.put("&Scaron", '\u0160');
1225     ESCAPE_STRINGS.put("&scaron", '\u0161');
1226     ESCAPE_STRINGS.put("&Yuml", '\u0178');
1227     ESCAPE_STRINGS.put("&circ", '\u02C6');
1228     ESCAPE_STRINGS.put("&tilde", '\u02DC');
1229     ESCAPE_STRINGS.put("&ensp", '\u2002');
1230     ESCAPE_STRINGS.put("&emsp", '\u2003');
1231     ESCAPE_STRINGS.put("&thinsp", '\u2009');
1232     ESCAPE_STRINGS.put("&zwnj", '\u200C');
1233     ESCAPE_STRINGS.put("&zwj", '\u200D');
1234     ESCAPE_STRINGS.put("&lrm", '\u200E');
1235     ESCAPE_STRINGS.put("&rlm", '\u200F');
1236     ESCAPE_STRINGS.put("&ndash", '\u2013');
1237     ESCAPE_STRINGS.put("&mdash", '\u2014');
1238     ESCAPE_STRINGS.put("&lsquo", '\u2018');
1239     ESCAPE_STRINGS.put("&rsquo", '\u2019');
1240     ESCAPE_STRINGS.put("&sbquo", '\u201A');
1241     ESCAPE_STRINGS.put("&ldquo", '\u201C');
1242     ESCAPE_STRINGS.put("&rdquo", '\u201D');
1243     ESCAPE_STRINGS.put("&bdquo", '\u201E');
1244     ESCAPE_STRINGS.put("&dagger", '\u2020');
1245     ESCAPE_STRINGS.put("&Dagger", '\u2021');
1246     ESCAPE_STRINGS.put("&permil", '\u2030');
1247     ESCAPE_STRINGS.put("&lsaquo", '\u2039');
1248     ESCAPE_STRINGS.put("&rsaquo", '\u203A');
1249     ESCAPE_STRINGS.put("&euro", '\u20AC');
1250 
1251     HEX_LETTERS = new HashSet<Character>(12);
1252 
1253     HEX_LETTERS.add('a');
1254     HEX_LETTERS.add('A');
1255     HEX_LETTERS.add('b');
1256     HEX_LETTERS.add('B');
1257     HEX_LETTERS.add('c');
1258     HEX_LETTERS.add('C');
1259     HEX_LETTERS.add('d');
1260     HEX_LETTERS.add('D');
1261     HEX_LETTERS.add('e');
1262     HEX_LETTERS.add('E');
1263     HEX_LETTERS.add('f');
1264     HEX_LETTERS.add('F');
1265   }
1266 
1267   /**
1268    * <p>
1269    * Replace all the occurences of HTML escape strings with the
1270    * respective characters.
1271    * </p>
1272    * <p>
1273    * The default mode is strict (requiring semicolons).
1274    * </p>
1275    *
1276    * @param s a <code>String</code> value
1277    * @return a <code>String</code> value
1278    * @throws NullPointerException if the input string is null.
1279    */
unescapeHTML(String s)1280   public static final String unescapeHTML(String s) {
1281     return unescapeHTML(s, false);
1282   }
1283 
1284   /**
1285    * Replace all the occurences of HTML escape strings with the
1286    * respective characters.
1287    *
1288    * @param s a <code>String</code> value
1289    * @param emulateBrowsers a <code>Boolean</code> value that tells the method
1290    *     to allow entity refs not terminated with a semicolon to be unescaped.
1291    *     (a quirk of this feature, and some browsers, is that an explicit
1292    *     terminating character is needed - e.g., &lt$ would be unescaped, but
1293    *     not &ltab - see the tests for a more in-depth description of browsers)
1294    * @return a <code>String</code> value
1295    * @throws NullPointerException if the input string is null.
1296    */
unescapeHTML(String s, boolean emulateBrowsers)1297   public static final String unescapeHTML(String s, boolean emulateBrowsers) {
1298 
1299     // See if there are any '&' in the string since that is what we look
1300     // for to escape. If there isn't, then we don't need to escape this string
1301     // Based on similar technique used in the escape function.
1302     int index = s.indexOf('&');
1303     if (index == -1) {
1304       // Nothing to escape. Return the original string.
1305       return s;
1306     }
1307 
1308     // We found an escaped character. Start slow escaping from there.
1309     char[] chars = s.toCharArray();
1310     char[] escaped = new char[chars.length];
1311     System.arraycopy(chars, 0, escaped, 0, index);
1312 
1313     // Note: escaped[pos] = end of the escaped char array.
1314     int pos = index;
1315 
1316     for (int i = index; i < chars.length;) {
1317       if (chars[i] != '&') {
1318         escaped[pos++] = chars[i++];
1319         continue;
1320       }
1321 
1322       // Allow e.g. &#123;
1323       int j = i + 1;
1324       boolean isNumericEntity = false;
1325       if (j < chars.length && chars[j] == '#') {
1326         j++;
1327         isNumericEntity = true;
1328       }
1329 
1330       // if it's numeric, also check for hex
1331       boolean isHexEntity = false;
1332       if (j < chars.length && (chars[j] == 'x' || chars[j] == 'X')) {
1333         j++;
1334         isHexEntity = true;
1335       }
1336 
1337       // Scan until we find a char that is not valid for this sequence.
1338       for (; j < chars.length; j++) {
1339         char ch = chars[j];
1340         boolean isDigit = Character.isDigit(ch);
1341         if (isNumericEntity) {
1342           // non-hex numeric sequence end condition
1343           if (!isHexEntity && !isDigit) {
1344             break;
1345           }
1346           // hex sequence end contition
1347           if (isHexEntity && !isDigit && !HEX_LETTERS.contains(ch)) {
1348             break;
1349           }
1350         }
1351         // anything other than a digit or letter is always an end condition
1352         if (!isDigit && !Character.isLetter(ch)) {
1353           break;
1354         }
1355       }
1356 
1357       boolean replaced = false;
1358       if ((j <= chars.length && emulateBrowsers) ||
1359           (j < chars.length && chars[j] == ';')) {
1360         // Check for &#D; and &#xD; pattern
1361         if (i + 2 < chars.length && s.charAt(i + 1) == '#') {
1362           try {
1363             long charcode = 0;
1364             char ch = s.charAt(i + 2);
1365             if (isHexEntity) {
1366               charcode = Long.parseLong(
1367                   new String(chars, i + 3, j - i - 3), 16);
1368             } else if (Character.isDigit(ch)) {
1369               charcode = Long.parseLong(
1370                   new String(chars, i + 2, j - i - 2));
1371             }
1372             if (charcode > 0 && charcode < 65536) {
1373               escaped[pos++] = (char) charcode;
1374               replaced = true;
1375             }
1376           } catch (NumberFormatException ex) {
1377             // Failed, not replaced.
1378           }
1379         } else {
1380           String key = new String(chars, i, j - i);
1381           Character repl = ESCAPE_STRINGS.get(key);
1382           if (repl != null) {
1383             escaped[pos++] = repl;
1384             replaced = true;
1385           }
1386         }
1387         // Skip over ';'
1388         if (j < chars.length && chars[j] == ';') {
1389           j++;
1390         }
1391       }
1392 
1393       if (!replaced) {
1394         // Not a recognized escape sequence, leave as-is
1395         System.arraycopy(chars, i, escaped, pos, j - i);
1396         pos += j - i;
1397       }
1398       i = j;
1399     }
1400     return new String(escaped, 0, pos);
1401   }
1402 
1403   // Escaper for < and > only.
1404   private static final CharEscaper LT_GT_ESCAPE =
1405       new CharEscaperBuilder()
1406         .addEscape('<', "&lt;")
1407         .addEscape('>', "&gt;")
1408         .toEscaper();
1409 
1410   private static final Pattern htmlTagPattern =
1411       Pattern.compile("</?[a-zA-Z][^>]*>");
1412 
1413   /**
1414    * Given a <code>String</code>, returns an equivalent <code>String</code> with
1415    * all HTML tags stripped. Note that HTML entities, such as "&amp;amp;" will
1416    * still be preserved.
1417    */
stripHtmlTags(String string)1418   public static String stripHtmlTags(String string) {
1419     if ((string == null) || "".equals(string)) {
1420       return string;
1421     }
1422     String stripped = htmlTagPattern.matcher(string).replaceAll("");
1423     /*
1424      * Certain inputs result in a well-formed HTML:
1425      * <<X>script>alert(0)<</X>/script> results in <script>alert(0)</script>
1426      * The following step ensures that no HTML can slip through by replacing all
1427      * < and > characters with &lt; and &gt; after HTML tags were stripped.
1428      */
1429     return LT_GT_ESCAPE.escape(stripped);
1430   }
1431 
1432   /**
1433    * We escape some characters in s to be able to insert strings into JavaScript
1434    * code. Also, make sure that we don't write out {@code -->} or
1435    * {@code </script>}, which may close a script tag, or any char in ["'>] which
1436    * might close a tag or attribute if seen inside an attribute.
1437    */
javaScriptEscape(CharSequence s)1438   public static String javaScriptEscape(CharSequence s) {
1439     return javaScriptEscapeHelper(s, false);
1440   }
1441 
1442   /**
1443    * We escape some characters in s to be able to insert strings into JavaScript
1444    * code. Also, make sure that we don't write out {@code -->} or
1445    * {@code </script>}, which may close a script tag, or any char in ["'>] which
1446    * might close a tag or attribute if seen inside an attribute.
1447    * Turns all non-ascii characters into ASCII javascript escape sequences
1448    * (eg \\uhhhh or \ooo).
1449    */
javaScriptEscapeToAscii(CharSequence s)1450   public static String javaScriptEscapeToAscii(CharSequence s) {
1451     return javaScriptEscapeHelper(s, true);
1452   }
1453 
1454   /**
1455    * Represents the type of javascript escaping to perform.  Each enum below
1456    * determines whether to use octal escapes and how to handle quotes.
1457    */
1458   public static enum JsEscapingMode {
1459     /** No octal escapes, pass-through ', and escape " as \". */
1460     JSON,
1461 
1462     /** Octal escapes, escapes ' and " to \42 and \47, respectively. */
1463     EMBEDDABLE_JS,
1464 
1465     /** Octal escapes, escapes ' and " to \' and \". */
1466     MINIMAL_JS
1467   }
1468 
1469   /**
1470    * Helper for javaScriptEscape and javaScriptEscapeToAscii
1471    */
javaScriptEscapeHelper(CharSequence s, boolean escapeToAscii)1472   private static String javaScriptEscapeHelper(CharSequence s,
1473                                                boolean escapeToAscii) {
1474     StringBuilder sb = new StringBuilder(s.length() * 9 / 8);
1475     try {
1476       escapeStringBody(s, escapeToAscii, JsEscapingMode.EMBEDDABLE_JS, sb);
1477     } catch (IOException ex) {
1478       // StringBuilder.append does not throw IOExceptions.
1479       throw new RuntimeException(ex);
1480     }
1481     return sb.toString();
1482   }
1483 
1484   /**
1485    * Appends the javascript string literal equivalent of plainText to the given
1486    * out buffer.
1487    * @param plainText the string to escape.
1488    * @param escapeToAscii true to encode all characters not in ascii [\x20-\x7e]
1489    *   <br>
1490    *   Full escaping of unicode entites isn't required but this makes
1491    *   sure that unicode strings will survive regardless of the
1492    *   content-encoding of the javascript file which is important when
1493    *   we use this function to autogenerated javascript source files.
1494    *   This is disabled by default because it makes non-latin strings very long.
1495    *   <br>
1496    *   If you seem to have trouble with character-encodings, maybe
1497    *   turn this on to see if the problem goes away.  If so, you need
1498    *   to specify a character encoding for your javascript somewhere.
1499    * @param jsEscapingMode determines the type of escaping to perform.
1500    * @param out the buffer to append output to.
1501    */
1502   /*
1503    * To avoid fallthrough, we would have to either use a hybrid switch-case/if
1504    * approach (which would obscure our special handling for ' and "), duplicate
1505    * the content of the default case, or pass a half-dozen parameters to a
1506    * helper method containing the code from the default case.
1507    */
  @SuppressWarnings("fallthrough")
  public static void escapeStringBody(
      CharSequence plainText, boolean escapeToAscii,
      JsEscapingMode jsEscapingMode, Appendable out)
      throws IOException {
    int pos = 0;  // Index just past the last char in plainText written to out.
    int len = plainText.length();
    // Walk the text one code point at a time; charCount is the number of
    // chars (1 or 2) that the current code point occupies in plainText.
    for (int codePoint, charCount, i = 0; i < len; i += charCount) {
      codePoint = Character.codePointAt(plainText, i);
      charCount = Character.charCount(codePoint);

      // Code points that need no escaping are not copied one by one; they
      // are flushed in bulk the next time something has to be escaped (or
      // once after the loop).
      if (!shouldEscapeChar(codePoint, escapeToAscii, jsEscapingMode)) {
        continue;
      }

      // Flush the pending unescaped run [pos, i) and move pos past the code
      // point that is about to be written in escaped form.
      out.append(plainText, pos, i);
      pos = i + charCount;
      switch (codePoint) {
        // Characters with a dedicated short escape.
        case '\b': out.append("\\b"); break;
        case '\t': out.append("\\t"); break;
        case '\n': out.append("\\n"); break;
        case '\f': out.append("\\f"); break;
        case '\r': out.append("\\r"); break;
        case '\\': out.append("\\\\"); break;
        case '"': case '\'':
          if (jsEscapingMode == JsEscapingMode.JSON && '\'' == codePoint) {
            // JSON does not escape a single quote (and it should be surrounded
            // by double quotes).
            out.append((char) codePoint);
            break;
          } else if (jsEscapingMode != JsEscapingMode.EMBEDDABLE_JS) {
            out.append('\\').append((char) codePoint);
            break;
          }
          // In EMBEDDABLE_JS mode neither branch above is taken, so quotes
          // get the numeric escape produced by the default case.
          // fall through
        default:
          // Numeric escape: four-digit hex for code points >= 0x100 and for
          // everything in JSON mode, octal otherwise.
          if (codePoint >= 0x100 || jsEscapingMode == JsEscapingMode.JSON) {
            appendHexJavaScriptRepresentation(codePoint, out);
          } else {
            // Output the minimal octal encoding.  We can't use an encoding
            // shorter than three digits if the next digit is a valid octal
            // digit.
            // Padding is also forced when this is the last code point of
            // plainText.
            boolean pad = i + charCount >= len
                || isOctal(plainText.charAt(i + charCount));
            appendOctalJavaScriptRepresentation((char) codePoint, pad, out);
          }
          break;
      }
    }
    // Flush whatever unescaped text follows the last escaped code point.
    out.append(plainText, pos, len);
  }
1559 
1560   /**
1561    * Helper for escapeStringBody, which decides whether to escape a character.
1562    */
shouldEscapeChar(int codePoint, boolean escapeToAscii, JsEscapingMode jsEscapingMode)1563   private static boolean shouldEscapeChar(int codePoint,
1564       boolean escapeToAscii, JsEscapingMode jsEscapingMode) {
1565     // If non-ASCII chars should be escaped, identify non-ASCII code points.
1566     if (escapeToAscii && (codePoint < 0x20 || codePoint > 0x7e)) {
1567       return true;
1568     }
1569 
1570     // If in JSON escaping mode, check JSON *and* JS escaping rules. The JS
1571     // escaping rules will escape more characters than needed for JSON,
1572     // but it is safe to escape any character in JSON.
1573     // TODO(bbavar): Remove unnecessary escaping for JSON, as long as it can be
1574     //               shown that this change in legacy behavior is safe.
1575     if (jsEscapingMode == JsEscapingMode.JSON) {
1576       return mustEscapeCharInJsonString(codePoint)
1577           || mustEscapeCharInJsString(codePoint);
1578     }
1579 
1580     // Finally, just check the default JS escaping rules.
1581     return mustEscapeCharInJsString(codePoint);
1582   }
1583 
1584   /**
1585    * Returns a javascript representation of the character in a hex escaped
1586    * format.
1587    *
1588    * @param codePoint The codepoint to append.
1589    * @param out The buffer to which the hex representation should be appended.
1590    */
appendHexJavaScriptRepresentation( int codePoint, Appendable out)1591   private static void appendHexJavaScriptRepresentation(
1592       int codePoint, Appendable out)
1593       throws IOException {
1594     if (Character.isSupplementaryCodePoint(codePoint)) {
1595       // Handle supplementary unicode values which are not representable in
1596       // javascript.  We deal with these by escaping them as two 4B sequences
1597       // so that they will round-trip properly when sent from java to javascript
1598       // and back.
1599       char[] surrogates = Character.toChars(codePoint);
1600       appendHexJavaScriptRepresentation(surrogates[0], out);
1601       appendHexJavaScriptRepresentation(surrogates[1], out);
1602       return;
1603     }
1604     out.append("\\u")
1605         .append(HEX_CHARS[(codePoint >>> 12) & 0xf])
1606         .append(HEX_CHARS[(codePoint >>> 8) & 0xf])
1607         .append(HEX_CHARS[(codePoint >>> 4) & 0xf])
1608         .append(HEX_CHARS[codePoint & 0xf]);
1609   }
1610 
1611   /**
1612    * Returns a javascript representation of the character in a hex escaped
1613    * format. Although this is a rather specific method, it is made public
1614    * because it is also used by the JSCompiler.
1615    *
1616    * @param ch The character to append.
1617    * @param pad true to force use of the full 3 digit representation.
1618    * @param out The buffer to which the hex representation should be appended.
1619    */
appendOctalJavaScriptRepresentation( char ch, boolean pad, Appendable out)1620   private static void appendOctalJavaScriptRepresentation(
1621       char ch, boolean pad, Appendable out) throws IOException {
1622     if (ch >= 0100
1623         // Be paranoid at the end of a string since someone might call
1624         // this method again with another string segment.
1625         || pad) {
1626       out.append('\\')
1627           .append(OCTAL_CHARS[(ch >>> 6) & 0x7])
1628           .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
1629           .append(OCTAL_CHARS[ch & 0x7]);
1630     } else if (ch >= 010) {
1631       out.append('\\')
1632           .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
1633           .append(OCTAL_CHARS[ch & 0x7]);
1634     } else {
1635       out.append('\\')
1636           .append(OCTAL_CHARS[ch & 0x7]);
1637     }
1638   }
1639 
1640   /**
1641    * Although this is a rather specific method, it is made public
1642    * because it is also used by the JSCompiler.
1643    *
1644    * @see #appendHexJavaScriptRepresentation(int, Appendable)
1645    */
appendHexJavaScriptRepresentation(StringBuilder sb, char c)1646   public static void appendHexJavaScriptRepresentation(StringBuilder sb,
1647                                                        char c) {
1648     try {
1649       appendHexJavaScriptRepresentation(c, sb);
1650     } catch (IOException ex) {
1651       // StringBuilder does not throw IOException.
1652       throw new RuntimeException(ex);
1653     }
1654   }
1655 
1656   /**
1657    * Undo escaping as performed in javaScriptEscape(.)
1658    * Throws an IllegalArgumentException if the string contains
1659    * bad escaping.
1660    */
javaScriptUnescape(String s)1661   public static String javaScriptUnescape(String s) {
1662     StringBuilder sb = new StringBuilder(s.length());
1663     for (int i = 0; i < s.length(); ) {
1664       char c = s.charAt(i);
1665       if (c == '\\') {
1666         i = javaScriptUnescapeHelper(s, i + 1, sb);
1667       } else {
1668         sb.append(c);
1669         i++;
1670       }
1671     }
1672     return sb.toString();
1673   }
1674 
1675   /**
1676    * Looks for an escape code starting at index i of s,
1677    * and appends it to sb.
1678    * @return the index of the first character in s
1679    * after the escape code.
1680    * @throws IllegalArgumentException if the escape code
1681    * is invalid
1682    */
javaScriptUnescapeHelper(String s, int i, StringBuilder sb)1683   private static int javaScriptUnescapeHelper(String s, int i,
1684                                               StringBuilder sb) {
1685     if (i >= s.length()) {
1686       throw new IllegalArgumentException(
1687           "End-of-string after escape character in [" + s + "]");
1688     }
1689 
1690     char c = s.charAt(i++);
1691     switch (c) {
1692       case 'n': sb.append('\n'); break;
1693       case 'r': sb.append('\r'); break;
1694       case 't': sb.append('\t'); break;
1695       case 'b': sb.append('\b'); break;
1696       case 'f': sb.append('\f'); break;
1697       case '\\':
1698       case '\"':
1699       case '\'':
1700       case '>':
1701         sb.append(c);
1702         break;
1703       case '0': case '1': case '2': case '3':
1704       case '4': case '5': case '6': case '7':
1705         --i;  // backup to first octal digit
1706         int nOctalDigits = 1;
1707         int digitLimit = c < '4' ? 3 : 2;
1708         while (nOctalDigits < digitLimit && i + nOctalDigits < s.length()
1709                && isOctal(s.charAt(i + nOctalDigits))) {
1710           ++nOctalDigits;
1711         }
1712         sb.append(
1713             (char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8));
1714         i += nOctalDigits;
1715         break;
1716       case 'x':
1717       case 'u':
1718         String hexCode;
1719         int nHexDigits = (c == 'u' ? 4 : 2);
1720         try {
1721           hexCode = s.substring(i, i + nHexDigits);
1722         } catch (IndexOutOfBoundsException ioobe) {
1723           throw new IllegalArgumentException(
1724               "Invalid unicode sequence [" + s.substring(i) + "] at index " + i
1725               + " in [" + s + "]");
1726         }
1727         int unicodeValue;
1728         try {
1729           unicodeValue = Integer.parseInt(hexCode, 16);
1730         } catch (NumberFormatException nfe) {
1731           throw new IllegalArgumentException(
1732               "Invalid unicode sequence [" + hexCode + "] at index " + i +
1733               " in [" + s + "]");
1734         }
1735         sb.append((char) unicodeValue);
1736         i += nHexDigits;
1737         break;
1738       default:
1739         throw new IllegalArgumentException(
1740             "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]"
1741             );
1742     }
1743 
1744     return i;
1745   }
1746 
  // Characters that xmlCDataEscape removes from its input: every C0 control
  // character (U+0000 through U+001F) other than \t (U+0009), \n (U+000A)
  // and \r (U+000D), plus U+FFFE and U+FFFF.
  private static final CharMatcher CONTROL_MATCHER = CharMatcher.anyOf(
      "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
      "\u0008\u000B\u000C\u000E\u000F" +
      "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
      "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
      "\uFFFE\uFFFF");
1754 
1755   /**
1756    * Escape a string that is meant to be embedded in a CDATA section.
1757    * The returned string is guaranteed to be valid CDATA content.
1758    * The syntax of CDATA sections is the following:
1759    * <blockquote>
1760    *   <code>&lt;[!CDATA[...]]&gt;</code>
1761    * </blockquote>
1762    * The only invalid character sequence in a CDATA tag is "]]&gt;".
1763    * If this sequence is present in the input string, we replace
1764    * it by closing the current CDATA field, then write ']]&amp;gt;',
1765    * then reopen a new CDATA section.
1766    */
1767   public static String xmlCDataEscape(String s) {
1768      // Make sure there are no illegal control characters.
1769      s = CONTROL_MATCHER.removeFrom(s);
1770     // Return the original reference if the string doesn't have a match.
1771     int found = s.indexOf("]]>");
1772     if (found == -1) {
1773       return s;
1774     }
1775 
1776     // For each occurrence of "]]>", append a string that adds "]]&gt;" after
1777     // the end of the CDATA which has just been closed, then opens a new CDATA.
1778     StringBuilder sb = new StringBuilder();
1779     int prev = 0;
1780     do {
1781       sb.append(s.substring(prev, found + 3));
1782       sb.append("]]&gt;<![CDATA[");
1783       prev = found + 3;
1784     } while ((found = s.indexOf("]]>", prev)) != -1);
1785     sb.append(s.substring(prev));
1786     return sb.toString();
1787   }
1788 
  /**
   * Escapes some characters in {@code s} so that the string can be inserted
   * into Java code. The work is delegated to the {@code JAVA_ESCAPE} escaper,
   * which also rewrites the characters &amp;, &lt; and &gt; as HTML entities.
   *
   * @param s the string to escape
   * @return the result of applying {@code JAVA_ESCAPE} to {@code s}
   * @deprecated Use {@link CharEscapers#asciiHtmlEscaper()} and {@link
   * CharEscapers#javaCharEscaper()} or {@link CharEscapers#javaStringEscaper()}
   * instead. This method combines two forms of escaping in a way that's rarely
   * desired.
   */
  @Deprecated
  public static String javaEscape(String s) {
    return JAVA_ESCAPE.escape(s);
  }

  // Java escaper: backslash-escapes newline, carriage return, tab, backslash,
  // double quote and single quote, and replaces &, < and > with the HTML
  // entities &amp;, &lt; and &gt;.
  private static final CharEscaper JAVA_ESCAPE =
      new CharEscaperBuilder()
        .addEscape('\n', "\\n")
        .addEscape('\r', "\\r")
        .addEscape('\t', "\\t")
        .addEscape('\\', "\\\\")
        .addEscape('\"', "\\\"")
        .addEscape('&', "&amp;")
        .addEscape('<', "&lt;")
        .addEscape('>', "&gt;")
        .addEscape('\'', "\\\'")
        .toEscaper();
1815 
  /**
   * Escapes the special characters from a string so it can be used as part of
   * a regex pattern. This method is for use on gnu.regexp style regular
   * expressions. The work is delegated to the {@code REGEX_ESCAPE} escaper.
   *
   * @param s the string to escape
   * @return the result of applying {@code REGEX_ESCAPE} to {@code s}
   * @deprecated Use {@link Pattern#quote(String)} instead. Note that it may not
   * be compatible with gnu.regexp style regular expressions.
   */
  @Deprecated
  public static String regexEscape(String s) {
    return REGEX_ESCAPE.escape(s);
  }

  // Regex escaper: puts a backslash in front of each of the characters
  // ( ) | * + ? . { } [ ] $ ^ and of the backslash itself.
  private static final CharEscaper REGEX_ESCAPE =
      new CharEscaperBuilder()
        .addEscape('(', "\\(")
        .addEscape(')', "\\)")
        .addEscape('|', "\\|")
        .addEscape('*', "\\*")
        .addEscape('+', "\\+")
        .addEscape('?', "\\?")
        .addEscape('.', "\\.")
        .addEscape('{', "\\{")
        .addEscape('}', "\\}")
        .addEscape('[', "\\[")
        .addEscape(']', "\\]")
        .addEscape('$', "\\$")
        .addEscape('^', "\\^")
        .addEscape('\\', "\\\\")
        .toEscaper();
1847 
1848   /**
1849    *  If you want to preserve the exact
1850    * current (odd) behavior when {@code doStrip} is {@code true}, use
1851    * {@code .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on
1852    * the splitter.
1853    *
1854    * @param in what to process
1855    * @param delimiter the delimiting string
1856    * @return the tokens
1857    * @deprecated see the detailed instructions under
1858    *     {@link #split(String, String, boolean)}
1859    */
1860   @Deprecated
1861   public static LinkedList<String> string2List(
1862       String in, String delimiter, boolean doStrip) {
1863     if (in == null) {
1864       return null;
1865     }
1866 
1867     LinkedList<String> out = new LinkedList<String>();
1868     string2Collection(in, delimiter, doStrip, out);
1869     return out;
1870   }
1871 
1872   /**
1873    * See the detailed instructions under {@link
1874    * #split(String, String, boolean)}. Pass the resulting {@code Iterable} to
1875    * {@link com.google.common.collect.Sets#newHashSet(Iterable)}. If you want to
1876    * preserve the exact current (odd) behavior when {@code doStrip} is {@code
1877    * true}, use {@code
1878    * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
1879    * splitter.
1880    *
1881    * @param in what to process
1882    * @param delimiter the delimiting string
1883    * @param doStrip to strip the substrings before adding to the list
1884    * @return the tokens
1885    * @deprecated see the detailed instructions under
1886    *     {@link #split(String, String, boolean)}
1887    */
1888   @Deprecated
1889   public static Set<String> string2Set(
1890        String in, String delimiter, boolean doStrip) {
1891     if (in == null) {
1892       return null;
1893     }
1894 
1895     HashSet<String> out = new HashSet<String>();
1896     string2Collection(in, delimiter, doStrip, out);
1897     return out;
1898   }
1899 
1900   /**
1901    * See the detailed instructions under {@link
1902    * #split(String, String, boolean)}. If you want to preserve the exact current
1903    * (odd) behavior when {@code doStrip} is {@code true}, use {@code
1904    * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
1905    * splitter.
1906    *
1907    * @param in The delimited input string to process
1908    * @param delimiter The string delimiting entries in the input string.
1909    * @param doStrip whether to strip the substrings before adding to the
1910    *          collection
1911    * @param collection The collection to which the strings will be added. If
1912    *          <code>null</code>, a new <code>List</code> will be created.
1913    * @return The collection to which the substrings were added. This is
1914    *         syntactic sugar to allow call chaining.
1915    * @deprecated see the detailed instructions under
1916    *     {@link #split(String, String, boolean)}
1917    */
1918   @Deprecated
1919   public static Collection<String> string2Collection(
1920       String in,
1921       String delimiter,
1922       boolean doStrip,
1923       Collection<String> collection) {
1924     if (in == null) {
1925       return null;
1926     }
1927     if (collection == null) {
1928       collection = new ArrayList<String>();
1929     }
1930     if (delimiter == null || delimiter.length() == 0) {
1931       collection.add(in);
1932       return collection;
1933     }
1934 
1935     int fromIndex = 0;
1936     int pos;
1937     while ((pos = in.indexOf(delimiter, fromIndex)) >= 0) {
1938       String interim = in.substring(fromIndex, pos);
1939       if (doStrip) {
1940         interim = strip(interim);
1941       }
1942       if (!doStrip || interim.length() > 0) {
1943         collection.add(interim);
1944       }
1945 
1946       fromIndex = pos + delimiter.length();
1947     }
1948 
1949     String interim = in.substring(fromIndex);
1950     if (doStrip) {
1951       interim = strip(interim);
1952     }
1953     if (!doStrip || interim.length() > 0) {
1954       collection.add(interim);
1955     }
1956 
1957     return collection;
1958   }
1959 
1960   /**
1961    * This converts a string to a Map. It will first split the string into
1962    * entries using delimEntry. Then each entry is split into a key and a value
1963    * using delimKey. By default we strip the keys. Use doStripEntry to strip
1964    * also the entries.
1965    *
1966    * Note that this method returns a {@link HashMap}, which means that entries
1967    * will be in no particular order. See {@link #stringToOrderedMap}.
1968    *
1969    * @param in the string to be processed
1970    * @param delimEntry delimiter for the entries
1971    * @param delimKey delimiter between keys and values
1972    * @param doStripEntry strip entries before inserting in the map
1973    *
1974    * @return HashMap
1975    */
string2Map( String in, String delimEntry, String delimKey, boolean doStripEntry)1976   public static HashMap<String, String> string2Map(
1977       String in, String delimEntry, String delimKey,
1978       boolean doStripEntry) {
1979     if (in == null) {
1980       return null;
1981     }
1982 
1983     return stringToMapImpl(new HashMap<String, String>(), in, delimEntry,
1984         delimKey, doStripEntry);
1985   }
1986 
1987   /**
1988    * This converts a string to a Map, with entries in the same order as the
1989    * key/value pairs in the input string. It will first split the string into
1990    * entries using delimEntry. Then each entry is split into a key and a value
1991    * using delimKey. By default we strip the keys. Use doStripEntry to strip
1992    * also the entries.
1993    *
1994    * @param in the string to be processed
1995    * @param delimEntry delimiter for the entries
1996    * @param delimKey delimiter between keys and values
1997    * @param doStripEntry strip entries before inserting in the map
1998    *
1999    * @return key/value pairs as a Map, in order
2000    */
stringToOrderedMap( String in, String delimEntry, String delimKey, boolean doStripEntry)2001   public static Map<String, String> stringToOrderedMap(
2002       String in, String delimEntry, String delimKey,
2003       boolean doStripEntry) {
2004     if (in == null) {
2005       return null;
2006     }
2007 
2008     return stringToMapImpl(new LinkedHashMap<String, String>(), in, delimEntry,
2009         delimKey, doStripEntry);
2010   }
2011 
2012   /**
2013    * This adds key/value pairs from the given string to the given Map.
2014    * It will first split the string into entries using delimEntry. Then each
2015    * entry is split into a key and a value using delimKey. By default we
2016    * strip the keys. Use doStripEntry to strip also the entries.
2017    *
2018    * @param out - Map to output into
2019    * @param in - the string to be processed
2020    * @param delimEntry - delimiter for the entries
2021    * @param delimKey - delimiter between keys and values
2022    * @param doStripEntry - strip entries before inserting in the map
2023    * @return out, for caller's convenience
2024    */
stringToMapImpl(T out, String in, String delimEntry, String delimKey, boolean doStripEntry)2025   private static <T extends Map<String, String>> T stringToMapImpl(T out,
2026       String in, String delimEntry, String delimKey, boolean doStripEntry) {
2027 
2028     if (isEmpty(delimEntry) || isEmpty(delimKey)) {
2029       out.put(strip(in), "");
2030       return out;
2031     }
2032 
2033     Iterator<String> it = string2List(in, delimEntry, false).iterator();
2034     int len = delimKey.length();
2035     while (it.hasNext()) {
2036       String entry = it.next();
2037       int pos = entry.indexOf(delimKey);
2038       if (pos > 0) {
2039         String value = entry.substring(pos + len);
2040         if (doStripEntry) {
2041           value = strip(value);
2042         }
2043         out.put(strip(entry.substring(0, pos)), value);
2044       } else {
2045         out.put(strip(entry), "");
2046       }
2047     }
2048 
2049     return out;
2050   }
2051 
2052   /**
2053    * This function concatenates the elements of a Map in a string with form
2054    *  "<key1><sepKey><value1><sepEntry>...<keyN><sepKey><valueN>"
2055    *
2056    * @param in - the map to be converted
2057    * @param sepKey - the separator to put between key and value
2058    * @param sepEntry - the separator to put between map entries
2059    * @return String
2060    * @deprecated create a {@link MapJoiner}, for example {@code
2061    *     Joiner.on(sepEntry).withKeyValueSeparator(sepKey)}. Ensure that your
2062    *     map is non-null and use this map joiner's {@link MapJoiner#join(Map)}
2063    *     method. To preserve behavior exactly, just in-line this method call.
2064    */
map2String( Map<K, V> in, String sepKey, String sepEntry)2065   @Deprecated public static <K, V> String map2String(
2066       Map<K, V> in, String sepKey, String sepEntry) {
2067     return (in == null) ? null : Joiner
2068         .on(sepEntry)
2069         .useForNull("null")
2070         .withKeyValueSeparator(sepKey)
2071         .join(in);
2072   }
2073 
2074   /**
2075    * Given a map, creates and returns a new map in which all keys are the
2076    * lower-cased version of each key.
2077    *
2078    * @param map A map containing String keys to be lowercased
2079    * @throws IllegalArgumentException if the map contains duplicate string keys
2080    *           after lower casing
2081    */
lowercaseKeys(Map<String, V> map)2082   public static <V> Map<String, V> lowercaseKeys(Map<String, V> map) {
2083     Map<String, V> result = new HashMap<String, V>(map.size());
2084     for (Map.Entry<String, V> entry : map.entrySet()) {
2085       String key = entry.getKey();
2086       if (result.containsKey(key.toLowerCase())) {
2087         throw new IllegalArgumentException(
2088             "Duplicate string key in map when lower casing");
2089       }
2090       result.put(key.toLowerCase(), entry.getValue());
2091     }
2092     return result;
2093   }
2094 
2095   /**
2096    * Replaces any string of adjacent whitespace characters with the whitespace
2097    * character " ".
2098    *
2099    * @param str the string you want to munge
2100    * @return String with no more excessive whitespace!
2101    * @deprecated ensure the string is not null and use {@code
2102    *     CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ')}; also consider
2103    *     whether you really want the legacy whitespace definition, or something
2104    *     more standard like {@link CharMatcher#WHITESPACE}.
2105    */
collapseWhitespace(String str)2106   @Deprecated public static String collapseWhitespace(String str) {
2107     return (str == null) ? null
2108         : CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ');
2109   }
2110 
2111   /**
2112    * Replaces any string of matched characters with the supplied string.<p>
2113    *
2114    * This is a more general version of collapseWhitespace.
2115    *
2116    * <pre>
2117    *   E.g. collapse("hello     world", " ", "::")
2118    *   will return the following string: "hello::world"
2119    * </pre>
2120    *
2121    * @param str the string you want to munge
2122    * @param chars all of the characters to be considered for munge
2123    * @param replacement the replacement string
2124    * @return munged and replaced string.
2125    * @deprecated if {@code replacement} is the empty string, use {@link
2126    *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
2127    *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
2128    *     replacement strings use {@link String#replaceAll(String, String)} with
2129    *     a regular expression that matches one or more occurrences of {@code
2130    *     chars}. In all cases you must first ensure that {@code str} is not
2131    *     null.
2132    */
collapse( String str, String chars, String replacement)2133   @Deprecated public static String collapse(
2134       String str, String chars, String replacement) {
2135     if (str == null) {
2136       return null;
2137     }
2138 
2139     StringBuilder newStr = new StringBuilder();
2140 
2141     boolean prevCharMatched = false;
2142     char c;
2143     for (int i = 0; i < str.length(); i++) {
2144       c = str.charAt(i);
2145       if (chars.indexOf(c) != -1) {
2146         // this character is matched
2147         if (prevCharMatched) {
2148           // apparently a string of matched chars, so don't append anything
2149           // to the string
2150           continue;
2151         }
2152         prevCharMatched = true;
2153         newStr.append(replacement);
2154       } else {
2155         prevCharMatched = false;
2156         newStr.append(c);
2157       }
2158     }
2159 
2160     return newStr.toString();
2161   }
2162 
2163   /**
2164    * Returns a string with all sequences of ISO control chars (0x00 to 0x1F and
2165    * 0x7F to 0x9F) replaced by the supplied string.  ISO control characters are
2166    * identified via {@link Character#isISOControl(char)}.
2167    *
2168    * @param str the string you want to strip of ISO control chars
2169    * @param replacement the replacement string
2170    * @return a String with all control characters replaced by the replacement
2171    * string, or null if input is null.
2172    * @deprecated use {@link CharMatcher#JAVA_ISO_CONTROL}. If {@code
2173    *     replacement} is the empty string, use {@link
2174    *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
2175    *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
2176    *     replacement strings use
2177    *     {@code str.replaceAll("\p{Cntrl}+", replacement)}.
2178    *     In all cases you must first ensure that {@code str} is not null.
2179    */
collapseControlChars( String str, String replacement)2180   @Deprecated public static String collapseControlChars(
2181       String str, String replacement) {
2182     /*
2183      * We re-implement the StringUtil.collapse() loop here rather than call
2184      * collapse() with an input String of control chars, because matching via
2185      * isISOControl() is about 10x faster.
2186      */
2187     if (str == null) {
2188       return null;
2189     }
2190 
2191     StringBuilder newStr = new StringBuilder();
2192 
2193     boolean prevCharMatched = false;
2194     char c;
2195     for (int i = 0; i < str.length(); i++) {
2196       c = str.charAt(i);
2197       if (Character.isISOControl(c)) {
2198         // this character is matched
2199         if (prevCharMatched) {
2200           // apparently a string of matched chars, so don't append anything
2201           // to the string
2202           continue;
2203         }
2204         prevCharMatched = true;
2205         newStr.append(replacement);
2206       } else {
2207         prevCharMatched = false;
2208         newStr.append(c);
2209       }
2210     }
2211 
2212     return newStr.toString();
2213   }
2214 
2215   /**
2216    * Read a String of up to maxLength bytes from an InputStream.
2217    *
2218    * <p>Note that this method uses the default platform encoding, and expects
2219    * that encoding to be single-byte, which is not always the case. Its use
2220    * is discouraged. For reading the entire stream (maxLength == -1) you can use:
2221    * <pre>
2222    *   CharStreams.toString(new InputStreamReader(is, Charsets.ISO_8859_1))
2223    * </pre>
2224    * {@code CharStreams} is in the {@code com.google.common.io} package.
2225    *
2226    * <p>For maxLength >= 0 a literal translation would be
2227    * <pre>
2228    *   CharStreams.toString(new InputStreamReader(
2229    *       new LimitInputStream(is, maxLength), Charsets.ISO_8859_1))
2230    * </pre>
2231    * For multi-byte encodings that is broken because the limit could end in
2232    * the middle of the character--it would be better to limit the reader than
2233    * the underlying stream.
2234    *
2235    * @param is input stream
2236    * @param maxLength max number of bytes to read from "is". If this is -1, we
2237    *          read everything.
2238    *
2239    * @return String up to maxLength bytes, read from "is"
2240    * @deprecated see the advice above
2241    */
stream2String(InputStream is, int maxLength)2242   @Deprecated public static String stream2String(InputStream is, int maxLength)
2243       throws IOException {
2244     byte[] buffer = new byte[4096];
2245     StringWriter sw = new StringWriter();
2246     int totalRead = 0;
2247     int read = 0;
2248 
2249     do {
2250       sw.write(new String(buffer, 0, read));
2251       totalRead += read;
2252       read = is.read(buffer, 0, buffer.length);
2253     } while (((-1 == maxLength) || (totalRead < maxLength)) && (read != -1));
2254 
2255     return sw.toString();
2256   }
2257 
2258   /**
2259    * Parse a list of substrings separated by a given delimiter. The delimiter
2260    * can also appear in substrings (just double them):
2261    *
2262    * parseDelimitedString("this|is", '|') returns ["this","is"]
2263    * parseDelimitedString("this||is", '|') returns ["this|is"]
2264    *
2265    * @param list String containing delimited substrings
2266    * @param delimiter Delimiter (anything except ' ' is allowed)
2267    *
2268    * @return String[] A String array of parsed substrings
2269    */
parseDelimitedList(String list, char delimiter)2270   public static String[] parseDelimitedList(String list,
2271                                             char delimiter) {
2272     String delim = "" + delimiter;
2273     // Append a sentinel of delimiter + space
2274     // (see comments below for more info)
2275     StringTokenizer st = new StringTokenizer(list + delim + " ",
2276                                              delim,
2277                                              true);
2278     ArrayList<String> v = new ArrayList<String>();
2279     String lastToken = "";
2280     StringBuilder word = new StringBuilder();
2281 
2282     // We keep a sliding window of 2 tokens
2283     //
2284     // delimiter : delimiter -> append delimiter to current word
2285     //                          and clear most recent token
2286     //                          (so delim : delim : delim will not
2287     //                          be treated as two escaped delims.)
2288     //
2289     // tok : delimiter -> append tok to current word
2290     //
2291     // delimiter : tok -> add current word to list, and clear it.
2292     //                    (We append a sentinel that conforms to this
2293     //                    pattern to make sure we've pushed every parsed token)
2294     while (st.hasMoreTokens()) {
2295       String tok = st.nextToken();
2296       if (lastToken != null) {
2297         if (tok.equals(delim)) {
2298           word.append(lastToken);
2299           if (lastToken.equals(delim)) { tok = null; }
2300         } else {
2301           if (word.length() != 0) {
2302             v.add(word.toString());
2303           }
2304           word.setLength(0);
2305         }
2306       }
2307       lastToken = tok;
2308     }
2309 
2310     return v.toArray(new String[0]);
2311   }
2312 
2313   /**
2314    * Compares two strings, guarding against nulls.
2315    *
2316    * @param nullsAreGreater true if nulls should be greater than any string,
2317    *  false is less than.
2318    * @deprecated use {@link String#CASE_INSENSITIVE_ORDER}, together with
2319    *     {@link com.google.common.collect.Ordering#nullsFirst()} or
2320    *     {@link com.google.common.collect.Ordering#nullsLast()} if
2321    *     needed
2322    */
compareToIgnoreCase(String s1, String s2, boolean nullsAreGreater)2323   @Deprecated public static int compareToIgnoreCase(String s1, String s2,
2324       boolean nullsAreGreater) {
2325     if (s1 == s2) {
2326       return 0; // Either both the same String, or both null
2327     }
2328     if (s1 == null) {
2329       return nullsAreGreater ? 1 : -1;
2330     }
2331     if (s2 == null) {
2332       return nullsAreGreater ? -1 : 1;
2333     }
2334     return s1.compareToIgnoreCase(s2);
2335   }
2336 
2337   /**
2338    * Splits s with delimiters in delimiter and returns the last token
2339    */
lastToken(String s, String delimiter)2340   public static String lastToken(String s, String delimiter) {
2341     return s.substring(CharMatcher.anyOf(delimiter).lastIndexIn(s) + 1);
2342   }
2343 
2344   private static final Pattern characterReferencePattern =
2345       Pattern.compile("&#?[a-zA-Z0-9]{1,8};");
2346 
2347   /**
2348    * Determines if a string contains what looks like an html character
2349    * reference. Useful for deciding whether unescaping is necessary.
2350    */
containsCharRef(String s)2351   public static boolean containsCharRef(String s) {
2352     return characterReferencePattern.matcher(s).find();
2353   }
2354 
2355   /**
2356    * Determines if a string is a Hebrew word. A string is considered to be
2357    * a Hebrew word if {@link #isHebrew(int)} is true for any of its characters.
2358    */
isHebrew(String s)2359   public static boolean isHebrew(String s) {
2360     int len = s.length();
2361     for (int i = 0; i < len; ++i) {
2362       if (isHebrew(s.codePointAt(i))) {
2363         return true;
2364       }
2365     }
2366     return false;
2367   }
2368 
2369   /**
2370    * Determines if a character is a Hebrew character.
2371    */
isHebrew(int codePoint)2372   public static boolean isHebrew(int codePoint) {
2373     return Character.UnicodeBlock.HEBREW.equals(
2374                Character.UnicodeBlock.of(codePoint));
2375   }
2376 
2377   /**
2378    * Determines if a string is a CJK word. A string is considered to be CJK
2379    * if {@link #isCjk(char)} is true for any of its characters.
2380    */
isCjk(String s)2381   public static boolean isCjk(String s) {
2382     int len = s.length();
2383     for (int i = 0; i < len; ++i) {
2384       if (isCjk(s.codePointAt(i))) {
2385         return true;
2386       }
2387     }
2388     return false;
2389   }
2390 
2391   /**
2392    * Unicode code blocks containing CJK characters.
2393    */
2394   private static final Set<Character.UnicodeBlock> CJK_BLOCKS;
2395   static {
2396     Set<Character.UnicodeBlock> set = new HashSet<Character.UnicodeBlock>();
2397     set.add(Character.UnicodeBlock.HANGUL_JAMO);
2398     set.add(Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT);
2399     set.add(Character.UnicodeBlock.KANGXI_RADICALS);
2400     set.add(Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION);
2401     set.add(Character.UnicodeBlock.HIRAGANA);
2402     set.add(Character.UnicodeBlock.KATAKANA);
2403     set.add(Character.UnicodeBlock.BOPOMOFO);
2404     set.add(Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO);
2405     set.add(Character.UnicodeBlock.KANBUN);
2406     set.add(Character.UnicodeBlock.BOPOMOFO_EXTENDED);
2407     set.add(Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS);
2408     set.add(Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS);
2409     set.add(Character.UnicodeBlock.CJK_COMPATIBILITY);
2410     set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A);
2411     set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS);
2412     set.add(Character.UnicodeBlock.HANGUL_SYLLABLES);
2413     set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS);
2414     set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS);
2415     set.add(Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS);
2416     set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B);
2417     set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT);
2418     CJK_BLOCKS = Collections.unmodifiableSet(set);
2419   }
2420 
2421   /**
2422    * Determines if a character is a CJK ideograph or a character typically
2423    * used only in CJK text.
2424    *
2425    * Note: This function cannot handle supplementary characters. To handle all
2426    * Unicode characters, including supplementary characters, use the function
2427    * {@link #isCjk(int)}.
2428    */
isCjk(char ch)2429   public static boolean isCjk(char ch) {
2430     return isCjk((int) ch);
2431   }
2432 
2433   /**
2434    * Determines if a character is a CJK ideograph or a character typically
2435    * used only in CJK text.
2436    */
isCjk(int codePoint)2437   public static boolean isCjk(int codePoint) {
2438     // Time-saving early exit for all Latin-1 characters.
2439     if ((codePoint & 0xFFFFFF00) == 0) {
2440       return false;
2441     }
2442 
2443     return CJK_BLOCKS.contains(Character.UnicodeBlock.of(codePoint));
2444   }
2445 
2446   /**
2447    * Returns the approximate display width of the string, measured in units of
2448    * ascii characters.
2449    *
2450    * @see StringUtil#displayWidth(char)
2451    */
displayWidth(String s)2452   public static int displayWidth(String s) {
2453     // TODO(kevinb): could reimplement this as
2454     // return s.length() * 2 - CharMatcher.SINGLE_WIDTH.countIn(s);
2455     int width = 0;
2456     int len = s.length();
2457     for (int i = 0; i < len; ++i) {
2458       width += displayWidth(s.charAt(i));
2459     }
2460     return width;
2461   }
2462 
2463   /**
2464    * Returns the approximate display width of the character, measured
2465    * in units of ascii characters.
2466    *
2467    * This method should err on the side of caution. By default, characters
2468    * are assumed to have width 2; this covers CJK ideographs, various
2469    * symbols and miscellaneous weird scripts. Given below are some Unicode
2470    * ranges for which it seems safe to assume that no character is
2471    * substantially wider than an ascii character:
2472    *   - Latin, extended Latin, even more extended Latin.
2473    *   - Greek, extended Greek, Cyrillic.
2474    *   - Some symbols (including currency symbols) and punctuation.
2475    *   - Half-width Katakana and Hangul.
2476    *   - Hebrew
2477    *   - Arabic
2478    *   - Thai
2479    * Characters in these ranges are given a width of 1.
2480    *
2481    * IMPORTANT: this function has analogs in C++ (encodingutils.cc,
2482    * named UnicodeCharWidth) and JavaScript
2483    * (java/com/google/ads/common/frontend/adwordsbase/resources/CreateAdUtil.js),
2484    * which need to be updated if you change the implementation here.
2485    */
displayWidth(char ch)2486   public static int displayWidth(char ch) {
2487     if (ch <= '\u04f9' ||   // CYRILLIC SMALL LETTER YERU WITH DIAERESIS
2488         ch == '\u05be' ||   // HEBREW PUNCTUATION MAQAF
2489         (ch >= '\u05d0' && ch <= '\u05ea') ||  // HEBREW LETTER ALEF ... TAV
2490         ch == '\u05F3' ||   // HEBREW PUNCTUATION GERESH
2491         ch == '\u05f4' ||   // HEBREW PUNCTUATION GERSHAYIM
2492         (ch >= '\u0600' && ch <= '\u06ff') || // Block=Arabic
2493         (ch >= '\u0750' && ch <= '\u077f') || // Block=Arabic_Supplement
2494         (ch >= '\ufb50' && ch <= '\ufdff') || // Block=Arabic_Presentation_Forms-A
2495         (ch >= '\ufe70' && ch <= '\ufeff') || // Block=Arabic_Presentation_Forms-B
2496         (ch >= '\u1e00' && ch <= '\u20af') || /* LATIN CAPITAL LETTER A WITH RING BELOW
2497                                                  ... DRACHMA SIGN */
2498         (ch >= '\u2100' && ch <= '\u213a') || // ACCOUNT OF ... ROTATED CAPITAL Q
2499         (ch >= '\u0e00' && ch <= '\u0e7f') || // Thai
2500         (ch >= '\uff61' && ch <= '\uffdc')) { /* HALFWIDTH IDEOGRAPHIC FULL STOP
2501                                                  ... HALFWIDTH HANGUL LETTER I */
2502       return 1;
2503     }
2504     return 2;
2505   }
2506 
2507   /**
2508    * @return a string representation of the given native array.
2509    */
toString(float[] iArray)2510   public static String toString(float[] iArray) {
2511     if (iArray == null) {
2512       return "NULL";
2513     }
2514 
2515     StringBuilder buffer = new StringBuilder();
2516     buffer.append("[");
2517     for (int i = 0; i < iArray.length; i++) {
2518       buffer.append(iArray[i]);
2519       if (i != (iArray.length - 1)) {
2520         buffer.append(", ");
2521       }
2522     }
2523     buffer.append("]");
2524     return buffer.toString();
2525   }
2526 
2527   /**
2528    * @return a string representation of the given native array.
2529    */
toString(long[] iArray)2530   public static String toString(long[] iArray) {
2531     if (iArray == null) {
2532       return "NULL";
2533     }
2534 
2535     StringBuilder buffer = new StringBuilder();
2536     buffer.append("[");
2537     for (int i = 0; i < iArray.length; i++) {
2538       buffer.append(iArray[i]);
2539       if (i != (iArray.length - 1)) {
2540         buffer.append(", ");
2541       }
2542     }
2543     buffer.append("]");
2544     return buffer.toString();
2545   }
2546 
2547   /**
2548    * @return a string representation of the given native array
2549    */
toString(int[] iArray)2550   public static String toString(int[] iArray) {
2551     if (iArray == null) {
2552       return "NULL";
2553     }
2554 
2555     StringBuilder buffer = new StringBuilder();
2556     buffer.append("[");
2557     for (int i = 0; i < iArray.length; i++) {
2558       buffer.append(iArray[i]);
2559       if (i != (iArray.length - 1)) {
2560         buffer.append(", ");
2561       }
2562     }
2563     buffer.append("]");
2564     return buffer.toString();
2565   }
2566 
2567   /**
2568    * @return a string representation of the given array.
2569    */
toString(String[] iArray)2570   public static String toString(String[] iArray) {
2571     if (iArray == null) { return "NULL"; }
2572 
2573     StringBuilder buffer = new StringBuilder();
2574     buffer.append("[");
2575     for (int i = 0; i < iArray.length; i++) {
2576       buffer.append("'").append(iArray[i]).append("'");
2577       if (i != iArray.length - 1) {
2578         buffer.append(", ");
2579       }
2580     }
2581     buffer.append("]");
2582 
2583     return buffer.toString();
2584   }
2585 
2586   /**
2587    * Returns the string, in single quotes, or "NULL". Intended only for
2588    * logging.
2589    *
2590    * @param s the string
2591    * @return the string, in single quotes, or the string "null" if it's null.
2592    */
toString(String s)2593   public static String toString(String s) {
2594     if (s == null) {
2595       return "NULL";
2596     } else {
2597       return new StringBuilder(s.length() + 2).append("'").append(s)
2598                                               .append("'").toString();
2599     }
2600   }
2601 
2602   /**
2603    * @return a string representation of the given native array
2604    */
toString(int[][] iArray)2605   public static String toString(int[][] iArray) {
2606     if (iArray == null) {
2607       return "NULL";
2608     }
2609 
2610     StringBuilder buffer = new StringBuilder();
2611     buffer.append("[");
2612     for (int i = 0; i < iArray.length; i++) {
2613       buffer.append("[");
2614       for (int j = 0; j < iArray[i].length; j++) {
2615         buffer.append(iArray[i][j]);
2616         if (j != (iArray[i].length - 1)) {
2617           buffer.append(", ");
2618         }
2619       }
2620       buffer.append("]");
2621       if (i != iArray.length - 1) {
2622         buffer.append(" ");
2623       }
2624     }
2625     buffer.append("]");
2626     return buffer.toString();
2627   }
2628 
2629   /**
2630    * @return a string representation of the given native array.
2631    */
toString(long[][] iArray)2632   public static String toString(long[][] iArray) {
2633     if (iArray == null) { return "NULL"; }
2634 
2635     StringBuilder buffer = new StringBuilder();
2636     buffer.append("[");
2637     for (int i = 0; i < iArray.length; i++) {
2638       buffer.append("[");
2639       for (int j = 0; j < iArray[i].length; j++) {
2640         buffer.append(iArray[i][j]);
2641         if (j != (iArray[i].length - 1)) {
2642           buffer.append(", ");
2643         }
2644       }
2645       buffer.append("]");
2646       if (i != iArray.length - 1) {
2647         buffer.append(" ");
2648       }
2649     }
2650     buffer.append("]");
2651     return buffer.toString();
2652   }
2653 
2654   /**
2655    * @return a String representation of the given object array.
2656    * The strings are obtained by calling toString() on the
2657    * underlying objects.
2658    */
toString(Object[] obj)2659   public static String toString(Object[] obj) {
2660     if (obj == null) { return "NULL"; }
2661     StringBuilder tmp = new StringBuilder();
2662     tmp.append("[");
2663     for (int i = 0; i < obj.length; i++) {
2664       tmp.append(obj[i].toString());
2665       if (i != obj.length - 1) {
2666         tmp.append(",");
2667       }
2668     }
2669     tmp.append("]");
2670     return tmp.toString();
2671   }
2672 
2673   private static final char[] HEX_CHARS
2674       = { '0', '1', '2', '3', '4', '5', '6', '7',
2675           '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
2676   private static final char[] OCTAL_CHARS = HEX_CHARS;  // ignore the last 8 :)
2677 
2678   /**
2679    * Convert a byte array to a hex-encoding string: "a33bff00..."
2680    *
2681    * @deprecated Use {@link ByteArrays#toHexString}.
2682    */
bytesToHexString(final byte[] bytes)2683   @Deprecated public static String bytesToHexString(final byte[] bytes) {
2684     return ByteArrays.toHexString(bytes);
2685   }
2686 
2687   /**
2688    * Convert a byte array to a hex-encoding string with the specified
2689    * delimiter: "a3&lt;delimiter&gt;3b&lt;delimiter&gt;ff..."
2690    */
bytesToHexString(final byte[] bytes, Character delimiter)2691   public static String bytesToHexString(final byte[] bytes,
2692       Character delimiter) {
2693     StringBuilder hex =
2694       new StringBuilder(bytes.length * (delimiter == null ? 2 : 3));
2695     int nibble1, nibble2;
2696     for (int i = 0; i < bytes.length; i++) {
2697       nibble1 = (bytes[i] >>> 4) & 0xf;
2698       nibble2 = bytes[i] & 0xf;
2699       if (i > 0 && delimiter != null) { hex.append(delimiter.charValue()); }
2700       hex.append(HEX_CHARS[nibble1]);
2701       hex.append(HEX_CHARS[nibble2]);
2702     }
2703     return hex.toString();
2704   }
2705 
2706   /**
2707    * Safely convert the string to uppercase.
2708    * @return upper case representation of the String; or null if
2709    * the input string is null.
2710    */
toUpperCase(String src)2711   public static String toUpperCase(String src) {
2712     if (src == null) {
2713       return null;
2714     } else {
2715       return src.toUpperCase();
2716     }
2717   }
2718 
2719   /**
2720    * Safely convert the string to lowercase.
2721    * @return lower case representation of the String; or null if
2722    * the input string is null.
2723    */
toLowerCase(String src)2724   public static String toLowerCase(String src) {
2725     if (src == null) {
2726       return null;
2727     } else {
2728       return src.toLowerCase();
2729     }
2730   }
2731 
2732   private static final Pattern dbSpecPattern =
2733       Pattern.compile("(.*)\\{(\\d+),(\\d+)\\}(.*)");
2734 
2735   /**
2736    * @param dbSpecComponent a single component of a DBDescriptor spec
2737    * (e.g. the host or database component). The expected format of the string is:
2738    * <br>
2739    *             <center>(prefix){(digits),(digits)}(suffix)</center>
2740    * </br>
2741    * @return a shard expansion of the given String.
2742    * Note that unless the pattern is matched exactly, no expansion is
2743    * performed and the original string is returned unaltered.
2744    * For example, 'db{0,1}.adz' is expanded into 'db0.adz, db1.adz'.
2745    * Note that this method is added to StringUtil instead of
2746    * DBDescriptor to better encapsulate the choice of regexp implementation.
2747    * @throws IllegalArgumentException if the string does not parse.
2748    */
expandShardNames(String dbSpecComponent)2749   public static String expandShardNames(String dbSpecComponent)
2750       throws IllegalArgumentException, IllegalStateException {
2751 
2752     Matcher matcher = dbSpecPattern.matcher(dbSpecComponent);
2753     if (matcher.find()) {
2754       try {
2755         String prefix = dbSpecComponent.substring(
2756           matcher.start(1), matcher.end(1));
2757         int minShard =
2758           Integer.parseInt(
2759             dbSpecComponent.substring(
2760               matcher.start(2), matcher.end(2)));
2761         int maxShard =
2762           Integer.parseInt(
2763             dbSpecComponent.substring(
2764               matcher.start(3), matcher.end(3)));
2765         String suffix = dbSpecComponent.substring(
2766           matcher.start(4), matcher.end(4));
2767         //Log2.logEvent(prefix + " " + minShard + " " + maxShard + " " + suffix);
2768         if (minShard > maxShard) {
2769           throw new IllegalArgumentException(
2770             "Maximum shard must be greater than or equal to " +
2771             "the minimum shard");
2772         }
2773         StringBuilder tmp = new StringBuilder();
2774         for (int shard = minShard; shard <= maxShard; shard++) {
2775           tmp.append(prefix).append(shard).append(suffix);
2776           if (shard != maxShard) {
2777             tmp.append(",");
2778           }
2779         }
2780         return tmp.toString();
2781       } catch (NumberFormatException nfex) {
2782         throw new IllegalArgumentException(
2783           "Malformed DB specification component: " + dbSpecComponent);
2784       }
2785     } else {
2786       return dbSpecComponent;
2787     }
2788   }
2789 
2790 
2791   /**
2792   * Returns a string that is equivalent to the specified string with its
2793   * first character converted to uppercase as by {@link String#toUpperCase()}.
2794   * The returned string will have the same value as the specified string if
2795   * its first character is non-alphabetic, if its first character is already
2796   * uppercase, or if the specified string is of length 0.
2797   *
2798   * <p>For example:
2799   * <pre>
2800   *    capitalize("foo bar").equals("Foo bar");
2801   *    capitalize("2b or not 2b").equals("2b or not 2b")
2802   *    capitalize("Foo bar").equals("Foo bar");
2803   *    capitalize("").equals("");
2804   * </pre>
2805   *
2806   * @param s the string whose first character is to be uppercased
2807   * @return a string equivalent to <tt>s</tt> with its first character
2808   *     converted to uppercase
2809   * @throws NullPointerException if <tt>s</tt> is null
2810   */
capitalize(String s)2811   public static String capitalize(String s) {
2812     if (s.length() == 0) {
2813       return s;
2814     }
2815     char first = s.charAt(0);
2816     char capitalized = Character.toUpperCase(first);
2817     return (first == capitalized)
2818         ? s
2819         : capitalized + s.substring(1);
2820   }
2821 
2822   /**
2823    * Examine a string to see if it starts with a given prefix (case
2824    * insensitive). Just like String.startsWith() except doesn't
2825    * respect case. Strings are compared in the same way as in
2826    * {@link String#equalsIgnoreCase}.
2827    *
2828    * @param str the string to examine
2829    * @param prefix the prefix to look for
2830    * @return a boolean indicating if str starts with prefix (case insensitive)
2831    */
startsWithIgnoreCase(String str, String prefix)2832   public static boolean startsWithIgnoreCase(String str, String prefix) {
2833     return str.regionMatches(true, 0, prefix, 0, prefix.length());
2834   }
2835 
2836   /**
2837    * Examine a string to see if it ends with a given suffix (case
2838    * insensitive). Just like String.endsWith() except doesn't respect
2839    * case. Strings are compared in the same way as in
2840    * {@link String#equalsIgnoreCase}.
2841    *
2842    * @param str the string to examine
2843    * @param suffix the suffix to look for
2844    * @return a boolean indicating if str ends with suffix (case insensitive)
2845    */
endsWithIgnoreCase(String str, String suffix)2846   public static boolean endsWithIgnoreCase(String str, String suffix) {
2847     int len = suffix.length();
2848     return str.regionMatches(true, str.length() - len, suffix, 0, len);
2849   }
2850 
2851   /**
2852    * @param c one codePoint
2853    * @return the number of bytes needed to encode this codePoint in UTF-8
2854    */
bytesUtf8(int c)2855   private static int bytesUtf8(int c) {
2856     if (c < 0x80) {
2857       return 1;
2858     } else if (c < 0x00800) {
2859       return 2;
2860     } else if (c < 0x10000) {
2861       return 3;
2862     } else if (c < 0x200000) {
2863       return 4;
2864 
2865     // RFC 3629 forbids the use of UTF-8 for codePoint greater than 0x10FFFF,
2866     // so if the caller respects this RFC, this should not happen
2867     } else if (c < 0x4000000) {
2868       return 5;
2869     } else {
2870       return 6;
2871     }
2872   }
2873 
2874   /**
2875    * @param str a string
2876    * @return the number of bytes required to represent this string in UTF-8
2877    */
bytesStorage(String str)2878   public static int bytesStorage(String str) {
2879     // offsetByCodePoint has a bug if its argument is the result of a
2880     // call to substring. To avoid this, we create a new String
2881     // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
2882     String s = new String(str);
2883 
2884     int len = 0;
2885     for (int i = 0; i < s.length(); i = s.offsetByCodePoints(i, 1)) {
2886       len += bytesUtf8(s.codePointAt(i));
2887     }
2888     return len;
2889   }
2890 
2891   /**
2892    * @param str a string
2893    * @param maxbytes
2894    * @return the beginning of the string, so that it uses less than
2895    *     maxbytes bytes in UTF-8
2896    * @throws IndexOutOfBoundsException if maxbytes is negative
2897    */
truncateStringForUtf8Storage(String str, int maxbytes)2898   public static String truncateStringForUtf8Storage(String str, int maxbytes) {
2899     if (maxbytes < 0) {
2900       throw new IndexOutOfBoundsException();
2901     }
2902 
2903     // offsetByCodePoint has a bug if its argument is the result of a
2904     // call to substring. To avoid this, we create a new String
2905     // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
2906     // TODO(cquinn): should be fixed as of 1.5.0_01
2907     String s = new String(str);
2908 
2909     int codepoints = 0;
2910     int bytesUsed = 0;
2911     for (codepoints = 0; codepoints < s.length();
2912         codepoints = s.offsetByCodePoints(codepoints, 1)) {
2913       int glyphBytes = StringUtil.bytesUtf8(s.codePointAt(codepoints));
2914       if (bytesUsed + glyphBytes > maxbytes) {
2915         break;
2916       }
2917       bytesUsed += glyphBytes;
2918     }
2919     return s.substring(0, codepoints);
2920   }
2921 
2922   /**
2923    * If the given string is of length {@code maxLength} or less, then it is
2924    * returned as is.
2925    * If the string is longer than {@code maxLength}, the returned string is
2926    * truncated before the last space character on or before
2927    * {@code source.charAt(maxLength)}. If the string has no spaces, the
2928    * returned string is truncated to {@code maxLength}.
2929    *
2930    * @param source the string to truncate if necessary
2931    * @param maxLength
2932    * @return the original string if its length is less than or equal to
2933    *     maxLength, otherwise a truncated string as mentioned above
2934    */
truncateIfNecessary(String source, int maxLength)2935   public static String truncateIfNecessary(String source, int maxLength) {
2936     if (source.length() <= maxLength) {
2937       return source;
2938     }
2939     String str = unicodePreservingSubstring(source, 0, maxLength);
2940 
2941     @SuppressWarnings("deprecation") // we'll make this go away before that does
2942     CharMatcher whitespaceMatcher = CharMatcher.LEGACY_WHITESPACE;
2943     String truncated = whitespaceMatcher.trimTrailingFrom(str);
2944 
2945     // We may have had multiple spaces at maxLength, which were stripped away
2946     if (truncated.length() < maxLength) {
2947       return truncated;
2948     }
2949     // We have a truncated string of length maxLength. If the next char was a
2950     // space, we truncated at a word boundary, so we can return immediately
2951     if (Character.isSpaceChar(source.charAt(maxLength))) {
2952       return truncated;
2953     }
2954     // We truncated in the middle of the word. Try to truncate before
2955     // the last space, if it exists. Otherwise, return the truncated string
2956     for (int i = truncated.length() - 1; i >= 0; --i) {
2957       if (Character.isSpaceChar(truncated.charAt(i))) {
2958         String substr = truncated.substring(0, i);
2959         return whitespaceMatcher.trimTrailingFrom(substr);
2960       }
2961     }
2962     return truncated;
2963   }
2964 
2965   /**
2966    * If this given string is of length {@code maxLength} or less, it will
2967    * be returned as-is.
2968    * Otherwise it will be trucated to {@code maxLength}, regardless of whether
2969    * there are any space characters in the String. If an ellipsis is requested
2970    * to be appended to the truncated String, the String will be truncated so
2971    * that the ellipsis will also fit within maxLength.
2972    * If no truncation was necessary, no ellipsis will be added.
2973    *
2974    * @param source the String to truncate if necessary
2975    * @param maxLength the maximum number of characters to keep
2976    * @param addEllipsis if true, and if the String had to be truncated,
2977    *     add "..." to the end of the String before returning. Additionally,
2978    *     the ellipsis will only be added if maxLength is greater than 3.
2979    * @return the original string if its length is less than or equal to
2980    *     maxLength, otherwise a truncated string as mentioned above
2981    */
truncateAtMaxLength(String source, int maxLength, boolean addEllipsis)2982   public static String truncateAtMaxLength(String source, int maxLength,
2983       boolean addEllipsis) {
2984 
2985     if (source.length() <= maxLength) {
2986       return source;
2987     }
2988     if (addEllipsis && maxLength > 3) {
2989       return unicodePreservingSubstring(source, 0, maxLength - 3) + "...";
2990     }
2991     return unicodePreservingSubstring(source, 0, maxLength);
2992   }
2993 
2994   /**
2995    * Normalizes {@code index} such that it respects Unicode character
2996    * boundaries in {@code str}.
2997    *
2998    * <p>If {@code index} is the low surrogate of a unicode character,
2999    * the method returns {@code index - 1}. Otherwise, {@code index} is
3000    * returned.
3001    *
3002    * <p>In the case in which {@code index} falls in an invalid surrogate pair
3003    * (e.g. consecutive low surrogates, consecutive high surrogates), or if
3004    * if it is not a valid index into {@code str}, the original value of
3005    * {@code index} is returned.
3006    *
3007    * @param str the String
3008    * @param index the index to be normalized
3009    * @return a normalized index that does not split a Unicode character
3010    */
unicodePreservingIndex(String str, int index)3011   public static int unicodePreservingIndex(String str, int index) {
3012     if (index > 0 && index < str.length()) {
3013       if (Character.isHighSurrogate(str.charAt(index - 1)) &&
3014           Character.isLowSurrogate(str.charAt(index))) {
3015         return index - 1;
3016       }
3017     }
3018     return index;
3019   }
3020 
3021   /**
3022    * Returns a substring of {@code str} that respects Unicode character
3023    * boundaries.
3024    *
3025    * <p>The string will never be split between a [high, low] surrogate pair,
3026    * as defined by {@link Character#isHighSurrogate} and
3027    * {@link Character#isLowSurrogate}.
3028    *
3029    * <p>If {@code begin} or {@code end} are the low surrogate of a unicode
3030    * character, it will be offset by -1.
3031    *
3032    * <p>This behavior guarantees that
3033    * {@code str.equals(StringUtil.unicodePreservingSubstring(str, 0, n) +
3034    *     StringUtil.unicodePreservingSubstring(str, n, str.length())) } is
3035    * true for all {@code n}.
3036    * </pre>
3037    *
3038    * <p>This means that unlike {@link String#substring(int, int)}, the length of
3039    * the returned substring may not necessarily be equivalent to
3040    * {@code end - begin}.
3041    *
3042    * @param str the original String
3043    * @param begin the beginning index, inclusive
3044    * @param end the ending index, exclusive
3045    * @return the specified substring, possibly adjusted in order to not
3046    *   split unicode surrogate pairs
3047    * @throws IndexOutOfBoundsException if the {@code begin} is negative,
3048    *   or {@code end} is larger than the length of {@code str}, or
3049    *   {@code begin} is larger than {@code end}
3050    */
unicodePreservingSubstring( String str, int begin, int end)3051   public static String unicodePreservingSubstring(
3052       String str, int begin, int end) {
3053     return str.substring(unicodePreservingIndex(str, begin),
3054         unicodePreservingIndex(str, end));
3055   }
3056 
3057   /**
3058    * Equivalent to:
3059    *
3060    * <pre>
3061    * {@link #unicodePreservingSubstring(String, int, int)}(
3062    *     str, begin, str.length())
3063    * </pre>
3064    */
unicodePreservingSubstring(String str, int begin)3065   public static String unicodePreservingSubstring(String str, int begin) {
3066     return unicodePreservingSubstring(str, begin, str.length());
3067   }
3068 
3069   /**
3070    * True iff the given character needs to be escaped in a javascript string
3071    * literal.
3072    * <p>
3073    * We need to escape the following characters in javascript string literals.
3074    * <dl>
3075    * <dt> \           <dd> the escape character
3076    * <dt> ', "        <dd> string delimiters.
3077    *                       TODO(msamuel): what about backticks (`) which are
3078    *                       non-standard but recognized as attribute delimiters.
3079    * <dt> &, <, >, =  <dd> so that a string literal can be embedded in XHTML
3080    *                       without further escaping.
3081    * </dl>
3082    * TODO(msamuel): If we're being paranoid, should we escape + to avoid UTF-7
3083    * attacks?
3084    * <p>
3085    * Unicode format control characters (category Cf) must be escaped since they
3086    * are removed by javascript parser in a pre-lex pass.
3087    * <br>According to EcmaScript 262 Section 7.1:
3088    * <blockquote>
3089    *     The format control characters can occur anywhere in the source text of
3090    *     an ECMAScript program. These characters are removed from the source
3091    *     text before applying the lexical grammar.
3092    * </blockquote>
3093    * <p>
3094    * Additionally, line terminators are not allowed to appear inside strings
3095    * and Section 7.3 says
3096    * <blockquote>
3097    *     The following characters are considered to be line terminators:<pre>
3098    *         Code Point Value   Name                  Formal Name
3099    *         \u000A             Line Feed             [LF]
3100    *         \u000D             Carriage Return       [CR]
3101    *         \u2028             Line separator        [LS]
3102    *         \u2029             Paragraph separator   [PS]
3103    * </pre></blockquote>
3104    *
3105    * @param codepoint a char instead of an int since the javascript language
3106    *    does not support extended unicode.
3107    */
mustEscapeCharInJsString(int codepoint)3108   static boolean mustEscapeCharInJsString(int codepoint) {
3109     return JS_ESCAPE_CHARS.contains(codepoint);
3110   }
3111 
3112   /**
3113    * True iff the given character needs to be escaped in a JSON string literal.
3114    * <p>
3115    * We need to escape the following characters in JSON string literals.
3116    * <dl>
3117    * <dt> \           <dd> the escape character
3118    * <dt> "           <dd> string delimiter
3119    * <dt> 0x00 - 0x1F <dd> control characters
3120    * </dl>
3121    * <p>
3122    * See EcmaScript 262 Section 15.12.1 for the full JSON grammar.
3123    */
mustEscapeCharInJsonString(int codepoint)3124   static boolean mustEscapeCharInJsonString(int codepoint) {
3125     return JSON_ESCAPE_CHARS.contains(codepoint);
3126   }
3127 
3128   /**
3129    * Builds a small set of code points.
3130    * {@code com.google.common.base} cannot depend on ICU4J, thus avoiding ICU's
3131    * {@code UnicodeSet}.
3132    * For all other purposes, please use {@code com.ibm.icu.text.UnicodeSet}.
3133    */
3134   private static class UnicodeSetBuilder {
3135     Set<Integer> codePointSet = new HashSet<Integer>();
3136 
addCodePoint(int c)3137     UnicodeSetBuilder addCodePoint(int c) {
3138       codePointSet.add(c);
3139       return this;
3140     }
3141 
addRange(int from, int to)3142     UnicodeSetBuilder addRange(int from, int to) {
3143       for (int i = from; i <= to; i++) {
3144         codePointSet.add(i);
3145       }
3146       return this;
3147     }
3148 
create()3149     Set<Integer> create() {
3150       return codePointSet;
3151     }
3152   }
3153 
3154   private static final Set<Integer> JS_ESCAPE_CHARS = new UnicodeSetBuilder()
3155       // All characters in the class of format characters, [:Cf:].
3156       // Source: http://unicode.org/cldr/utility/list-unicodeset.jsp.
3157       .addCodePoint(0xAD)
3158       .addRange(0x600, 0x603)
3159       .addCodePoint(0x6DD)
3160       .addCodePoint(0x070F)
3161       .addRange(0x17B4, 0x17B5)
3162       .addRange(0x200B, 0x200F)
3163       .addRange(0x202A, 0x202E)
3164       .addRange(0x2060, 0x2064)
3165       .addRange(0x206A, 0x206F)
3166       .addCodePoint(0xFEFF)
3167       .addRange(0xFFF9, 0xFFFB)
3168       .addRange(0x0001D173, 0x0001D17A)
3169       .addCodePoint(0x000E0001)
3170       .addRange(0x000E0020, 0x000E007F)
3171       // Plus characters mentioned in the docs of mustEscapeCharInJsString().
3172       .addCodePoint(0x0000)
3173       .addCodePoint(0x000A)
3174       .addCodePoint(0x000D)
3175       .addRange(0x2028, 0x2029)
3176       .addCodePoint(0x0085)
3177       .addCodePoint(Character.codePointAt("'", 0))
3178       .addCodePoint(Character.codePointAt("\"", 0))
3179       .addCodePoint(Character.codePointAt("&", 0))
3180       .addCodePoint(Character.codePointAt("<", 0))
3181       .addCodePoint(Character.codePointAt(">", 0))
3182       .addCodePoint(Character.codePointAt("=", 0))
3183       .addCodePoint(Character.codePointAt("\\", 0))
3184       .create();
3185 
3186   private static final Set<Integer> JSON_ESCAPE_CHARS = new UnicodeSetBuilder()
3187       .addCodePoint(Character.codePointAt("\"", 0))
3188       .addCodePoint(Character.codePointAt("\\", 0))
3189       .addRange(0x0000, 0x001F)
3190       .create();
3191 
3192   /**
3193    * <b>To be deprecated:</b> use {@link CharEscapers#xmlEscaper()} instead.
3194    */
xmlEscape(String s)3195   public static String xmlEscape(String s) {
3196     return CharEscapers.xmlEscaper().escape(s);
3197   }
3198 
3199   /**
3200    * <b>To be deprecated:</b> use {@link CharEscapers#asciiHtmlEscaper()} instead.
3201    */
htmlEscape(String s)3202   public static String htmlEscape(String s) {
3203     return CharEscapers.asciiHtmlEscaper().escape(s);
3204   }
3205 }