/** * Copyright (c) 2000, Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.android.mail.common.base; import static com.google.android.mail.common.base.Preconditions.checkArgument; import com.google.common.base.Joiner; import com.google.common.base.Joiner.MapJoiner; import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Static utility methods and constants pertaining to {@code String} or {@code * CharSequence} instances. */ public final class StringUtil { private StringUtil() {} // COV_NF_LINE /** * A completely arbitrary selection of eight whitespace characters. See * this spreadsheet for more details * about whitespace characters. * * @deprecated Rewrite your code to use {@link CharMatcher#WHITESPACE}, or * consider the precise set of characters you want to match and construct * the right explicit {@link CharMatcher} or {@link String} for your own * purposes. */ @Deprecated public static final String WHITE_SPACES = " \r\n\t\u3000\u00A0\u2007\u202F"; /** A string containing the carriage return and linefeed characters. 
*/ public static final String LINE_BREAKS = "\r\n"; /** * Old location of {@link Strings#isNullOrEmpty}; this method will be * deprecated soon. */ public static boolean isEmpty(String string) { return Strings.isNullOrEmpty(string); } /** * Returns {@code true} if the given string is null, empty, or comprises only * whitespace characters, as defined by {@link CharMatcher#WHITESPACE}. * *
Warning: there are many competing definitions of "whitespace";
* please see this spreadsheet for
* details.
*
* @param string the string reference to check
* @return {@code true} if {@code string} is null, empty, or consists of
* whitespace characters only
*/
public static boolean isEmptyOrWhitespace(String string) {
  // A null reference is treated as "empty" and never reaches the matcher.
  if (string == null) {
    return true;
  }
  // Every non-null input is judged by CharMatcher.WHITESPACE.
  return CharMatcher.WHITESPACE.matchesAllOf(string);
}
/**
 * Legacy alias for {@link Strings#nullToEmpty}; this method will be
 * deprecated soon.
 *
 * @param string the string to pass through; forwarded unchanged
 * @return the result of {@link Strings#nullToEmpty} for {@code string}
 */
public static String makeSafe(String string) {
  final String safe = Strings.nullToEmpty(string);
  return safe;
}
/**
 * Legacy alias for {@link Strings#emptyToNull}; this method will be
 * deprecated soon.
 *
 * @param string the string to pass through; forwarded unchanged
 * @return the result of {@link Strings#emptyToNull} for {@code string}
 */
public static String toNullIfEmpty(String string) {
  final String normalized = Strings.emptyToNull(string);
  return normalized;
}
/**
 * Passes a string through only when it has real content: the argument is
 * returned as-is unless {@link #isEmptyOrWhitespace} reports it as null,
 * empty or whitespace-only, in which case {@code null} is returned. See
 * {@link #isEmptyOrWhitespace} for the definition of whitespace in use.
 *
 * @param string the string to test and possibly return
 * @return {@code null} if {@code string} is null, empty, or contains only
 *     whitespace characters; {@code string} itself otherwise
 */
public static String toNullIfEmptyOrWhitespace(String string) {
  if (isEmptyOrWhitespace(string)) {
    return null;
  }
  return string;
}
/**
 * Legacy alias for {@link Strings#repeat}; this method will be deprecated
 * soon.
 *
 * @param string the string to repeat; forwarded unchanged
 * @param count the repetition count; forwarded unchanged
 * @return the result of {@link Strings#repeat} for the given arguments
 */
public static String repeat(String string, int count) {
  final String repeated = Strings.repeat(string, count);
  return repeated;
}
/**
* Return the first index in the string of any of the specified characters,
* starting at a given index, or {@code -1} if none of the characters is
* present.
*
* @param string the non-null character sequence to look in
* @param chars a non-null character sequence containing the set of characters
* to look for. If empty, this method will find no matches and return
* {@code -1}
* @param fromIndex the index of the first character to examine in the input
* string. If negative, the entire string will be searched. If greater
* than or equal to the string length, no characters will be searched and
* {@code -1} will be returned.
* @return the index of the first match, or {@code -1} if no match was found.
* Guaranteed to be either {@code -1} or a number greater than or equal to
* {@code fromIndex}
* @throws NullPointerException if any argument is null
*/
// author: pault
public static int indexOfChars(
CharSequence string, CharSequence chars, int fromIndex) {
if (fromIndex >= string.length()) {
return -1;
}
/*
* Prepare lookup structures for the characters. TODO(pault): This loop
* could be factored into another method to allow caching of the resulting
* struct if a use-case of very large character sets exists.
*/
Set Only breaking whitespace characters (those which match
* {@link CharMatcher#BREAKING_WHITESPACE}) are treated as whitespace by
* this method. Non-breaking whitespace characters will be considered as
* ordinary characters which are connected to any other adjacent
* non-whitespace characters, and will therefore appear in the returned
* string in their original context.
*
* @param lines array of lines to format
* @param width the fixed width (in characters)
*/
public static String fixedWidth(String[] lines, int width) {
List Note: If {@code fromIndex} is zero, use {@link CharMatcher}
* instead for this: {@code CharMatcher.noneOf(chars).lastIndexIn(str)}.
*/
// TODO(kevinb): after adding fromIndex versions of (last)IndexOf to
// CharMatcher, deprecate this
public static int lastIndexNotOf(String str, String chars, int fromIndex) {
  // Clamp the starting point to the last valid index. For an empty string
  // (or a negative fromIndex) this yields a negative cursor and the scan
  // below is skipped entirely.
  int cursor = Math.min(fromIndex, str.length() - 1);
  // Walk backwards until a character that is NOT listed in 'chars' shows up.
  while (cursor >= 0) {
    final char candidate = str.charAt(cursor);
    if (chars.indexOf(candidate) < 0) {
      return cursor;
    }
    cursor--;
  }
  return -1;
}
/**
 * Like {@code String.replace()} except that it accepts any number of old
 * chars: every occurrence in {@code str} of a character listed in
 * {@code oldchars} is replaced with {@code newchar}.
 * Example: {@code replaceChars("Hello, world!", "H,!", ' ')} replaces the
 * {@code 'H'}, the {@code ','} and the {@code '!'} with a space each.
 *
 * @deprecated use {@code CharMatcher#replaceFrom(String, char)}, for example
 *     {@code CharMatcher.anyOf(oldchars).replaceFrom(str, newchar)}
 */
@Deprecated public static String replaceChars(
    String str, CharSequence oldchars, char newchar) {
  final CharMatcher unwanted = CharMatcher.anyOf(oldchars);
  return unwanted.replaceFrom(str, newchar);
}
/**
 * Removes every occurrence in {@code str} of a character listed in
 * {@code oldchars}.
 * Example: {@code removeChars("Hello, world!", ",!")} returns
 * {@code "Hello world"}.
 *
 * @deprecated use {@link CharMatcher#removeFrom(CharSequence)}, for example
 *     {@code CharMatcher.anyOf(oldchars).removeFrom(str)}
 */
@Deprecated public static String removeChars(
    String str, CharSequence oldchars) {
  final CharMatcher unwanted = CharMatcher.anyOf(oldchars);
  return unwanted.removeFrom(str);
}
// See http://www.microsoft.com/typography/unicode/1252.htm
// Matches the "fancy" single quotes: code points 0x91 and 0x92 plus
// U+2018 and U+2019. (Presumably 0x91/0x92 cover Windows-1252 bytes that
// were decoded without translation -- confirm against the page above.)
private static final CharMatcher FANCY_SINGLE_QUOTE
= CharMatcher.anyOf("\u0091\u0092\u2018\u2019");
// Matches the "fancy" double quotes: code points 0x93 and 0x94 plus
// U+201C and U+201D.
private static final CharMatcher FANCY_DOUBLE_QUOTE
= CharMatcher.anyOf("\u0093\u0094\u201c\u201d");
/**
 * Replaces Microsoft "smart quotes" (curly double and single quotes) with
 * their ASCII counterparts.
 *
 * @param str the text to normalize
 * @return {@code str} with every character matched by
 *     {@code FANCY_SINGLE_QUOTE} replaced by {@code '} and every character
 *     matched by {@code FANCY_DOUBLE_QUOTE} replaced by {@code "}
 */
public static String replaceSmartQuotes(String str) {
  // Single quotes are normalized first, double quotes second.
  return FANCY_DOUBLE_QUOTE.replaceFrom(
      FANCY_SINGLE_QUOTE.replaceFrom(str, '\''), '"');
}
/**
 * Converts a string of hex digits to a byte array, with the first byte in
 * the array being the MSB. The string passed in should be just the raw
 * digits (upper or lower case), with no leading or trailing characters
 * (like '0x' or 'h'). An odd number of characters is supported. If the
 * string is empty, an empty array is returned.
 *
 * This is significantly faster than using
 * new BigInteger(str, 16).toByteArray();
 * especially with larger strings. Here are the results of some
 * microbenchmarks done on a P4 2.8GHz 2GB RAM running
 * linux 2.4.22-gg11 and JDK 1.5 with an optimized build:
 *
 * String length   hexToBytes (usec)   BigInteger
 * -----------------------------------------------------
 *            16               0.570         1.43
 *           256               8.21         44.4
 *          1024              32.8         526
 *         16384             546        121000
 *
 * @param str the hex digits to decode
 * @return the decoded bytes, most significant byte first
 * @throws IllegalArgumentException if a non-hex character is encountered
 */
public static byte[] hexToBytes(CharSequence str) {
  final int digitCount = str.length();
  final byte[] result = new byte[(digitCount + 1) / 2];
  if (digitCount == 0) {
    return result;
  }
  // With an odd digit count the very first digit only fills the low nibble
  // of byte 0, so nibble numbering starts at 1 instead of 0.
  int nibble = digitCount % 2;
  for (int i = 0; i < digitCount; i++, nibble++) {
    final char digit = str.charAt(i);
    if (!isHex(digit)) {
      throw new IllegalArgumentException("string contains non-hex chars");
    }
    final int slot = nibble >> 1;
    if ((nibble & 1) == 0) {
      // Even nibble: high half of the byte, overwrites the slot.
      result[slot] = (byte) (hexValue(digit) << 4);
    } else {
      // Odd nibble: low half, added onto whatever the slot already holds.
      result[slot] += (byte) hexValue(digit);
    }
  }
  return result;
}
/**
 * Converts any instances of "\r" or "\r\n" style EOLs into "\n" (Line Feed).
 *
 * @param input the text to normalize
 * @return {@code input} itself when it contains no carriage return,
 *     otherwise a new string in which each "\r\n" pair and each lone "\r"
 *     has become a single "\n"
 */
public static String convertEOLToLF(String input) {
  int cr = input.indexOf('\r');
  if (cr < 0) {
    // No carriage return anywhere: hand back the very same instance.
    return input;
  }
  final int length = input.length();
  final StringBuilder out = new StringBuilder(length);
  int copied = 0; // index of the first character not yet written to 'out'
  while (cr >= 0) {
    out.append(input, copied, cr).append('\n');
    copied = cr + 1;
    // A line feed right after the carriage return belongs to the same EOL.
    if (copied < length && input.charAt(copied) == '\n') {
      copied++;
    }
    cr = input.indexOf('\r', copied);
  }
  out.append(input, copied, length);
  return out.toString();
}
/**
 * Legacy alias for {@link Strings#padStart}; this method will be deprecated
 * soon.
 *
 * @param s the string to pad; forwarded unchanged
 * @param len the length argument; forwarded unchanged
 * @param padChar the padding character; forwarded unchanged
 * @return the result of {@link Strings#padStart} for the given arguments
 */
public static String padLeft(String s, int len, char padChar) {
  final String padded = Strings.padStart(s, len, padChar);
  return padded;
}
/**
 * Legacy alias for {@link Strings#padEnd}; this method will be deprecated
 * soon.
 *
 * @param s the string to pad; forwarded unchanged
 * @param len the length argument; forwarded unchanged
 * @param padChar the padding character; forwarded unchanged
 * @return the result of {@link Strings#padEnd} for the given arguments
 */
public static String padRight(String s, int len, char padChar) {
  final String padded = Strings.padEnd(s, len, padChar);
  return padded;
}
/**
 * Returns a string consisting of {@code s}, with each of the first
 * {@code len} characters replaced by {@code maskChar}.
 *
 * @param s the string to mask
 * @param len how many leading characters to mask; values larger than the
 *     string length mask the whole string, values of zero or less leave
 *     {@code s} untouched
 * @param maskChar the character written over the masked positions
 * @return {@code s} itself when {@code len <= 0}, otherwise the masked copy
 */
public static String maskLeft(String s, int len, char maskChar) {
  if (len <= 0) {
    return s;
  }
  final char[] buffer = s.toCharArray();
  final int maskedCount = Math.min(len, buffer.length);
  // Overwrite the leading characters in place; the tail stays as it was.
  for (int i = 0; i < maskedCount; i++) {
    buffer[i] = maskChar;
  }
  return new String(buffer);
}
// True exactly for the ASCII octal digits '0' through '7'.
private static boolean isOctal(char c) {
  return !(c < '0' || c > '7');
}
// True exactly for the ASCII hex digits: '0'-'9', 'a'-'f' and 'A'-'F'.
private static boolean isHex(char c) {
  if (c >= '0' && c <= '9') {
    return true;
  }
  if (c >= 'a' && c <= 'f') {
    return true;
  }
  return c >= 'A' && c <= 'F';
}
// Numeric value of a hex digit. Callers are expected to have checked isHex()
// first: any character outside '0'-'9' and 'a'-'f' is treated as if it were
// an upper-case hex letter, whatever it actually is.
private static int hexValue(char c) {
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  final char base = (c >= 'a' && c <= 'f') ? 'a' : 'A';
  return (c - base) + 10;
}
/**
 * Unescapes any C escape sequences (\n, \r, \\, \ooo, \xHH, etc) and returns
 * the resulting string.
 *
 * <p>Hex escapes consume one or two hex digits and octal escapes one to
 * three octal digits. An unrecognized escape is copied through with its
 * backslash, as is a lone backslash at the very end of the input.
 *
 * @param s the text to unescape
 * @return {@code s} itself when it contains no backslash, otherwise the
 *     unescaped copy
 */
public static String unescapeCString(String s) {
  if (s.indexOf('\\') < 0) {
    // Fast path: nothing to unescape
    return s;
  }
  final int length = s.length();
  final StringBuilder out = new StringBuilder();
  int pos = 0;
  while (pos < length) {
    final char current = s.charAt(pos++);
    if (current != '\\' || pos >= length) {
      // Ordinary character, or a backslash with nothing after it.
      out.append(current);
      continue;
    }
    final char code = s.charAt(pos++);
    switch (code) {
      case 'a': out.append('\007'); break;
      case 'b': out.append('\b'); break;
      case 'f': out.append('\f'); break;
      case 'n': out.append('\n'); break;
      case 'r': out.append('\r'); break;
      case 't': out.append('\t'); break;
      case 'v': out.append('\013'); break;
      case '\\':
      case '?':
      case '\'':
      case '"':
        // These escapes stand for the character itself.
        out.append(code);
        break;
      default:
        if (code == 'x' && pos < length && isHex(s.charAt(pos))) {
          // "\xXX": one mandatory and one optional hex digit.
          int value = hexValue(s.charAt(pos++));
          if (pos < length && isHex(s.charAt(pos))) {
            value = value * 16 + hexValue(s.charAt(pos++));
          }
          out.append((char) value);
        } else if (isOctal(code)) {
          // "\OOO": the digit just read plus up to two more.
          int value = code - '0';
          for (int extra = 0;
              extra < 2 && pos < length && isOctal(s.charAt(pos));
              extra++) {
            value = value * 8 + (s.charAt(pos++) - '0');
          }
          out.append((char) value);
        } else {
          // Propagate unknown escape sequences.
          out.append('\\').append(code);
        }
        break;
    }
  }
  return out.toString();
}
/**
 * Unescapes any MySQL escape sequences.
 * See MySQL language reference Chapter 6 at
 * http://www.mysql.com/doc/.
 * This function will not work for other SQL-like dialects.
 *
 * <p>The input must be wrapped in matching single or double quotes. Inside,
 * backslash sequences are decoded (an unknown sequence simply loses its
 * backslash) and a doubled quoting character stands for one such character.
 *
 * @param s string to unescape, with the surrounding quotes.
 * @return unescaped string, without the surrounding quotes.
 * @exception IllegalArgumentException if s is not a valid MySQL string.
 */
public static String unescapeMySQLString(String s)
    throws IllegalArgumentException {
  final int length = s.length();
  // the string must be quoted 'like this' or "like this"
  if (length < 2) {
    throw new IllegalArgumentException("not a valid MySQL string: " + s);
  }
  final char quote = s.charAt(0);
  if (quote != s.charAt(length - 1) || (quote != '\'' && quote != '"')) {
    throw new IllegalArgumentException("not a valid MySQL string: " + s);
  }
  final StringBuilder out = new StringBuilder(length - 2);
  boolean afterBackslash = false; // previous character was a backslash
  boolean afterQuote = false;     // previous character was the quoting char
  for (int i = 1; i < length - 1; i++) {
    final char ch = s.charAt(i);
    if (afterBackslash) {
      afterBackslash = false;
      switch (ch) {
        case '0': out.append('\0'); break;
        case 'b': out.append('\b'); break;
        case 'n': out.append('\n'); break;
        case 'r': out.append('\r'); break;
        case 't': out.append('\t'); break;
        case 'z': out.append('\032'); break;
        default:
          // Quotes, the backslash itself and every non-special character:
          // the backslash disappears and the character is kept.
          out.append(ch);
          break;
      }
    } else if (afterQuote) {
      // quoting characters must be doubled inside a string
      if (ch != quote) {
        throw new IllegalArgumentException("not a valid MySQL string: " + s);
      }
      afterQuote = false;
      out.append(quote);
    } else if (ch == '\\') {
      afterBackslash = true;
    } else if (ch == quote) {
      afterQuote = true;
    } else {
      out.append(ch);
    }
  }
  // string contents cannot end with a special character
  if (afterBackslash || afterQuote) {
    throw new IllegalArgumentException("not a valid MySQL string: " + s);
  }
  return out.toString();
}
// TODO(pbarry): move all HTML methods to common.html package
static final Map
* Replace all the occurences of HTML escape strings with the
* respective characters.
*
* The default mode is strict (requiring semicolons).
*
*
* This is a more general version of collapseWhitespace.
*
* Note that this method uses the default platform encoding, and expects
* that encoding to be single-byte, which is not always the case. Its use
* is discouraged. For reading the entire stream (maxLength == -1) you can use:
* For maxLength >= 0 a literal translation would be
* For example:
* If {@code index} is the low surrogate of a unicode character,
* the method returns {@code index - 1}. Otherwise, {@code index} is
* returned.
*
* In the case in which {@code index} falls in an invalid surrogate pair
* (e.g. consecutive low surrogates, consecutive high surrogates), or if
* if it is not a valid index into {@code str}, the original value of
* {@code index} is returned.
*
* @param str the String
* @param index the index to be normalized
* @return a normalized index that does not split a Unicode character
*/
public static int unicodePreservingIndex(String str, int index) {
  // Only an index strictly inside the string can sit between two chars;
  // anything else (including out-of-range values) is returned untouched.
  final boolean interior = index > 0 && index < str.length();
  // Step back by one only when the index would split a well-formed
  // [high surrogate, low surrogate] pair.
  final boolean splitsPair = interior
      && Character.isHighSurrogate(str.charAt(index - 1))
      && Character.isLowSurrogate(str.charAt(index));
  return splitsPair ? index - 1 : index;
}
/**
 * Returns a substring of {@code str} that respects Unicode character
 * boundaries.
 *
 * <p>The string will never be split between a [high, low] surrogate pair,
 * as defined by {@link Character#isHighSurrogate} and
 * {@link Character#isLowSurrogate}. If {@code begin} or {@code end} points
 * at the low surrogate of such a pair, it is moved back by one.
 *
 * <p>This behavior guarantees that
 * {@code str.equals(StringUtil.unicodePreservingSubstring(str, 0, n) +
 *     StringUtil.unicodePreservingSubstring(str, n, str.length())) } is
 * true for all {@code n}.
 *
 * <p>This means that unlike {@link String#substring(int, int)}, the length
 * of the returned substring may not necessarily be equivalent to
 * {@code end - begin}.
 *
 * @param str the original String
 * @param begin the beginning index, inclusive
 * @param end the ending index, exclusive
 * @return the specified substring, possibly adjusted in order to not
 *     split unicode surrogate pairs
 * @throws IndexOutOfBoundsException if the {@code begin} is negative,
 *     or {@code end} is larger than the length of {@code str}, or
 *     {@code begin} is larger than {@code end}
 */
public static String unicodePreservingSubstring(
    String str, int begin, int end) {
  // Normalize both bounds first, then let String.substring validate them.
  final int safeBegin = unicodePreservingIndex(str, begin);
  final int safeEnd = unicodePreservingIndex(str, end);
  return str.substring(safeBegin, safeEnd);
}
/**
* Equivalent to:
*
*
* We need to escape the following characters in javascript string literals.
*
* Unicode format control characters (category Cf) must be escaped since they
* are removed by javascript parser in a pre-lex pass.
*
* Additionally, line terminators are not allowed to appear inside strings
* and Section 7.3 says
*
* We need to escape the following characters in JSON string literals.
*
* See EcmaScript 262 Section 15.12.1 for the full JSON grammar.
*/
static boolean mustEscapeCharInJsonString(int codepoint) {
  // The whole decision is a membership test against JSON_ESCAPE_CHARS,
  // which is declared elsewhere in this class.
  final boolean needsEscape = JSON_ESCAPE_CHARS.contains(codepoint);
  return needsEscape;
}
/**
* Builds a small set of code points.
* {@code com.google.common.base} cannot depend on ICU4J, thus avoiding ICU's
* {@code UnicodeSet}.
* For all other purposes, please use {@code com.ibm.icu.text.UnicodeSet}.
*/
private static class UnicodeSetBuilder {
Set
*
*
* @param str the string to split. Must not be null.
* @param delims the delimiter characters. Each character in the string
* is individually treated as a delimiter.
* @param trimTokens if true, leading/trailing whitespace is removed
* from the tokens
* @return an array of tokens. Will not return null.
* @deprecated
*/
@Deprecated
public static String[] split(
    String str, String delims, boolean trimTokens) {
  // StringTokenizer treats each character of 'delims' as a delimiter and
  // never produces empty tokens between adjacent delimiters.
  final StringTokenizer tokens = new StringTokenizer(str, delims);
  final String[] result = new String[tokens.countTokens()];
  int next = 0;
  while (next < result.length) {
    final String token = tokens.nextToken();
    result[next++] = trimTokens ? token.trim() : token;
  }
  return result;
}
/**
 * Trims whitespace from only the beginning of a string.
 * This is a convenience method; it simply calls {@code trimStart(s, null)}.
 *
 * @param s String to be trimmed
 * @return String with whitespace characters removed from the beginning
 */
public static String trimStart(String s) {
  final String noExtraChars = null;
  return trimStart(s, noExtraChars);
}
/**
 * Trims characters from only the beginning of a string.
 * This method removes all whitespace characters (as defined by
 * {@code Character.isWhitespace(char)}), in addition to the characters
 * provided, from the beginning of the provided string.
 *
 * @param s String to be trimmed
 * @param extraChars Characters in addition to whitespace characters that
 *     should be trimmed. May be null.
 * @return String with whitespace and characters in extraChars removed
 *     from the beginning; {@code s} itself when nothing was trimmed
 */
public static String trimStart(String s, String extraChars) {
  final int length = s.length();
  int start = 0;
  for (; start < length; start++) {
    final char ch = s.charAt(start);
    final boolean trimmable = Character.isWhitespace(ch)
        || (extraChars != null && extraChars.indexOf(ch) >= 0);
    if (!trimmable) {
      break;
    }
  }
  return (start == 0) ? s : s.substring(start);
}
/**
 * Trims whitespace from only the end of a string.
 * This is a convenience method; it simply calls {@code trimEnd(s, null)}.
 *
 * @param s String to be trimmed
 * @return String with whitespace characters removed from the end
 */
public static String trimEnd(String s) {
  final String noExtraChars = null;
  return trimEnd(s, noExtraChars);
}
/**
 * Trims characters from only the end of a string.
 * This method removes all whitespace characters (as defined by
 * {@code Character.isWhitespace(char)}), in addition to the characters
 * provided, from the end of the provided string.
 *
 * @param s String to be trimmed
 * @param extraChars Characters in addition to whitespace characters that
 *     should be trimmed. May be null.
 * @return String with whitespace and characters in extraChars removed
 *     from the end; {@code s} itself when nothing was trimmed
 */
public static String trimEnd(String s, String extraChars) {
  final int length = s.length();
  int end = length; // exclusive end of the part that is kept
  while (end > 0) {
    final char ch = s.charAt(end - 1);
    final boolean trimmable = Character.isWhitespace(ch)
        || (extraChars != null && extraChars.indexOf(ch) >= 0);
    if (!trimmable) {
      break;
    }
    end--;
  }
  return (end == length) ? s : s.substring(0, end);
}
/**
 * Splits {@code str} on the given delimiter characters and trims each
 * token; equivalent to {@code split(str, delims, true)}.
 *
 * @param str the string to split. Must not be null.
 * @param delims the delimiter characters. Each character in the
 *     string is individually treated as a delimiter.
 * @return an array of tokens. Will not return null. Leading/trailing
 *     whitespace is removed from the tokens.
 * @deprecated see the detailed instructions under
 *     {@link #split(String, String, boolean)}
 */
@Deprecated
public static String[] splitAndTrim(String str, String delims) {
  final boolean trimTokens = true;
  return split(str, delims, trimTokens);
}
/**
 * Parses a comma-separated list of ints and returns them as an array.
 * Empty fields between commas are skipped by the tokenizer; tokens are
 * handed to {@link Integer#parseInt(String)} without any trimming.
 *
 * @param str the comma-separated list
 * @return the parsed values, in input order
 * @throws IllegalArgumentException if a token is not a valid int
 */
public static int[] splitInts(String str) throws IllegalArgumentException {
  final StringTokenizer fields = new StringTokenizer(str, ",");
  final int[] values = new int[fields.countTokens()];
  int filled = 0;
  while (fields.hasMoreTokens()) {
    values[filled++] = Integer.parseInt(fields.nextToken());
  }
  return values;
}
/**
 * Parses a comma-separated list of longs and returns them as an array.
 * Empty fields between commas are skipped by the tokenizer; tokens are
 * handed to {@link Long#parseLong(String)} without any trimming.
 *
 * @param str the comma-separated list
 * @return the parsed values, in input order
 * @throws IllegalArgumentException if a token is not a valid long
 */
public static long[] splitLongs(String str) throws IllegalArgumentException {
  final StringTokenizer fields = new StringTokenizer(str, ",");
  final long[] values = new long[fields.countTokens()];
  int filled = 0;
  while (fields.hasMoreTokens()) {
    values[filled++] = Long.parseLong(fields.nextToken());
  }
  return values;
}
/**
 * Replaces the occurrences of {@code what} in {@code str} with
 * {@code with}.
 *
 * @param str the string to process
 * @param what the text to replace; must not be empty
 * @param with the replacement text
 * @return {@code str} where {@code what} was replaced with {@code with}
 *
 * @deprecated Please use {@link String#replace(CharSequence, CharSequence)}.
 */
@Deprecated
public static String replace(
    String str, CharSequence what, CharSequence with) {
  // An empty target is rejected for compatibility with the old
  // implementation. String.replace() itself can cope with an empty target,
  // but it does something kind of weird in that case.
  final boolean targetIsNonEmpty = what.length() > 0;
  checkArgument(targetIsNonEmpty);
  return str.replace(what, with);
}
// Splitter configured to cut on the line feed character ('\n') only and,
// via omitEmptyStrings(), to leave out empty pieces.
private static final Splitter NEWLINE_SPLITTER =
Splitter.on('\n').omitEmptyStrings();
/**
* Reformats the given string to a fixed width by inserting carriage returns
* and trimming unnecessary whitespace. See
* {@link #fixedWidth(String[], int)} for details. The {@code str} argument
* to this method will be split on newline characters ({@code '\n'}) only
* (regardless of platform). An array of resulting non-empty strings is
* then passed to {@link #fixedWidth(String[], int)} as the {@code lines}
* parameter.
*
* @param str the string to format
* @param width the fixed width (in characters)
*/
public static String fixedWidth(String str, int width) {
List
*
*/
@Deprecated
public static String megastrip(
    String str, boolean left, boolean right, String what) {
  if (str == null) {
    return null;
  }
  // The matcher is built before the flags are looked at, so 'what' is
  // always handed to CharMatcher.anyOf even when nothing gets stripped.
  final CharMatcher strippable = CharMatcher.anyOf(what);
  if (left && right) {
    return strippable.trimFrom(str);
  }
  if (left) {
    return strippable.trimLeadingFrom(str);
  }
  if (right) {
    return strippable.trimTrailingFrom(str);
  }
  return str;
}
/**
 * Strips legacy whitespace from both ends of a string.
 *
 * @param str what to strip
 * @return the stripped string, or {@code null} if {@code str} is null
 * @deprecated ensure the string is not null and use {@code
 *     CharMatcher.LEGACY_WHITESPACE.trimFrom(str)}; also consider whether you
 *     really want the legacy whitespace definition, or something more
 *     standard like {@link CharMatcher#WHITESPACE}.
 */
@SuppressWarnings("deprecation") // this is deprecated itself
@Deprecated public static String strip(String str) {
  if (str == null) {
    return null;
  }
  return CharMatcher.LEGACY_WHITESPACE.trimFrom(str);
}
/**
 * Strips legacy whitespace from both ends of a string and collapses each
 * run of legacy whitespace in the middle into a single space.
 *
 * @param str what to strip
 * @return the stripped and collapsed string, or {@code null} if
 *     {@code str} is null
 * @deprecated ensure the string is not null and use {@code
 *     CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ')}; also
 *     consider whether you really want the legacy whitespace definition, or
 *     something more standard like {@link CharMatcher#WHITESPACE}.
 */
@SuppressWarnings("deprecation") // this is deprecated itself
@Deprecated public static String stripAndCollapse(String str) {
  if (str == null) {
    return null;
  }
  return CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ');
}
/**
 * Returns the part of {@code str} that follows {@code prefix} when the
 * string starts with that prefix, and {@code null} otherwise.
 * Analogous to the c++ functions strprefix and var_strprefix.
 *
 * @param str the string to strip
 * @param prefix the expected prefix
 * @return the stripped string or {@code null} if the string
 *     does not start with the prefix
 */
public static String stripPrefix(String str, String prefix) {
  if (!str.startsWith(prefix)) {
    return null;
  }
  return str.substring(prefix.length());
}
/**
 * Case insensitive version of stripPrefix. Strings are compared in
 * the same way as in {@link String#equalsIgnoreCase}.
 * Analogous to the c++ functions strcaseprefix and var_strcaseprefix.
 *
 * @param str the string to strip
 * @param prefix the expected prefix
 * @return the stripped string or {@code null} if the string
 *     does not start with the prefix
 */
public static String stripPrefixIgnoreCase(String str, String prefix) {
  if (!startsWithIgnoreCase(str, prefix)) {
    return null;
  }
  return str.substring(prefix.length());
}
/**
 * Returns the part of {@code str} that precedes {@code suffix} when the
 * string ends with that suffix, and {@code null} otherwise.
 * Analogous to the c++ function strsuffix.
 *
 * @param str the string to strip
 * @param suffix the expected suffix
 * @return the stripped string or {@code null} if the string
 *     does not end with the suffix
 */
public static String stripSuffix(String str, String suffix) {
  if (!str.endsWith(suffix)) {
    return null;
  }
  final int keep = str.length() - suffix.length();
  return str.substring(0, keep);
}
/**
 * Case insensitive version of stripSuffix. Strings are compared in
 * the same way as in {@link String#equalsIgnoreCase}.
 * Analogous to the c++ function strcasesuffix.
 *
 * @param str the string to strip
 * @param suffix the expected suffix
 * @return the stripped string or {@code null} if the string
 *     does not end with the suffix
 */
public static String stripSuffixIgnoreCase(
    String str, String suffix) {
  if (!endsWithIgnoreCase(str, suffix)) {
    return null;
  }
  final int keep = str.length() - suffix.length();
  return str.substring(0, keep);
}
/**
 * Strips all non-digit characters from a string.
 *
 * The resulting string will only contain characters retained by
 * {@code CharMatcher.JAVA_DIGIT}.
 *
 * @param str the string to strip
 * @return a string consisting of digits only, or an empty string
 * @deprecated use {@code CharMatcher.JAVA_DIGIT.retainFrom(str)} (also
 *     consider whether this is really the definition of "digit" you wish to
 *     use)
 */
@Deprecated public static String stripNonDigits(String str) {
  final String digitsOnly = CharMatcher.JAVA_DIGIT.retainFrom(str);
  return digitsOnly;
}
/**
* Finds the last index in str of a character not in the characters
* in 'chars' (similar to ANSI string.find_last_not_of).
*
* Returns -1 if no such character can be found.
*
* String
value
* @return a String
value
* @throws NullPointerException if the input string is null.
*/
public static final String unescapeHTML(String s) {
  // Strict mode: entity references are passed on with emulateBrowsers off.
  final boolean emulateBrowsers = false;
  return unescapeHTML(s, emulateBrowsers);
}
/**
* Replace all the occurences of HTML escape strings with the
* respective characters.
*
* @param s a String
value
* @param emulateBrowsers a Boolean
value that tells the method
* to allow entity refs not terminated with a semicolon to be unescaped.
* (a quirk of this feature, and some browsers, is that an explicit
* terminating character is needed - e.g., <$ would be unescaped, but
* not <ab - see the tests for a more in-depth description of browsers)
* @return a String
value
* @throws NullPointerException if the input string is null.
*/
public static final String unescapeHTML(String s, boolean emulateBrowsers) {
// See if there are any '&' in the string since that is what we look
// for to escape. If there isn't, then we don't need to escape this string
// Based on similar technique used in the escape function.
int index = s.indexOf('&');
if (index == -1) {
// Nothing to escape. Return the original string.
return s;
}
// We found an escaped character. Start slow escaping from there.
char[] chars = s.toCharArray();
char[] escaped = new char[chars.length];
System.arraycopy(chars, 0, escaped, 0, index);
// Note: escaped[pos] = end of the escaped char array.
int pos = index;
for (int i = index; i < chars.length;) {
if (chars[i] != '&') {
escaped[pos++] = chars[i++];
continue;
}
// Allow e.g. {
int j = i + 1;
boolean isNumericEntity = false;
if (j < chars.length && chars[j] == '#') {
j++;
isNumericEntity = true;
}
// if it's numeric, also check for hex
boolean isHexEntity = false;
if (j < chars.length && (chars[j] == 'x' || chars[j] == 'X')) {
j++;
isHexEntity = true;
}
// Scan until we find a char that is not valid for this sequence.
for (; j < chars.length; j++) {
char ch = chars[j];
boolean isDigit = Character.isDigit(ch);
if (isNumericEntity) {
// non-hex numeric sequence end condition
if (!isHexEntity && !isDigit) {
break;
}
// hex sequence end contition
if (isHexEntity && !isDigit && !HEX_LETTERS.contains(ch)) {
break;
}
}
// anything other than a digit or letter is always an end condition
if (!isDigit && !Character.isLetter(ch)) {
break;
}
}
boolean replaced = false;
if ((j <= chars.length && emulateBrowsers) ||
(j < chars.length && chars[j] == ';')) {
// Check for D; and
pattern
if (i + 2 < chars.length && s.charAt(i + 1) == '#') {
try {
long charcode = 0;
char ch = s.charAt(i + 2);
if (isHexEntity) {
charcode = Long.parseLong(
new String(chars, i + 3, j - i - 3), 16);
} else if (Character.isDigit(ch)) {
charcode = Long.parseLong(
new String(chars, i + 2, j - i - 2));
}
// D800 to DFFF are for UTF16 surrogate pairs, and are not valid HTML entities
// Code points 0xFFFE and 0xFFFF are unicode noncharacters
if ((charcode > 0 && charcode < 0xD800) || (charcode > 0xDFFF && charcode < 0xFFFE)) {
escaped[pos++] = (char) charcode;
replaced = true;
} else if (charcode >= 0x10000 && charcode < 0x110000) {
// These characters are represented as surrogate pairs in UTF16
escaped[pos++] = (char) ((charcode - 0x10000) / 0x400 + 0xD800);
escaped[pos++] = (char) ((charcode - 0x10000) % 0x400 + 0xDC00);
replaced = true;
}
} catch (NumberFormatException ex) {
// Failed, not replaced.
}
} else {
String key = new String(chars, i, j - i);
Character repl = ESCAPE_STRINGS.get(key);
if (repl != null) {
escaped[pos++] = repl;
replaced = true;
}
}
// Skip over ';'
if (j < chars.length && chars[j] == ';') {
j++;
}
}
if (!replaced) {
// Not a recognized escape sequence, leave as-is
System.arraycopy(chars, i, escaped, pos, j - i);
pos += j - i;
}
i = j;
}
return new String(escaped, 0, pos);
}
// Escaper for < and > only. Each bracket is mapped to its HTML entity;
// mapping a bracket to itself would make the escaper a no-op.
private static final CharEscaper LT_GT_ESCAPE =
    new CharEscaperBuilder()
        .addEscape('<', "&lt;")
        .addEscape('>', "&gt;")
        .toEscaper();
// Matches an opening or closing HTML tag: '<', an optional '/', a leading
// ASCII letter, then everything up to and including the next '>'.
// The pattern must start with "</?": a bare leading '?' is a dangling
// quantifier and makes Pattern.compile throw during class initialization.
private static final Pattern htmlTagPattern =
    Pattern.compile("</?[a-zA-Z][^>]*>");
/**
* Given a String
, returns an equivalent String
with
* all HTML tags stripped. Note that HTML entities, such as "&" will
* still be preserved.
*/
public static String stripHtmlTags(String string) {
if ((string == null) || "".equals(string)) {
return string;
}
String stripped = htmlTagPattern.matcher(string).replaceAll("");
/*
* Certain inputs result in a well-formed HTML:
* <
* Full escaping of unicode entites isn't required but this makes
* sure that unicode strings will survive regardless of the
* content-encoding of the javascript file which is important when
* we use this function to autogenerated javascript source files.
* This is disabled by default because it makes non-latin strings very long.
*
* If you seem to have trouble with character-encodings, maybe
* turn this on to see if the problem goes away. If so, you need
* to specify a character encoding for your javascript somewhere.
* @param jsEscapingMode determines the type of escaping to perform.
* @param out the buffer to append output to.
*/
/*
 * Implementation note: the switch below falls through on purpose.
 * To avoid fallthrough, we would have to either use a hybrid switch-case/if
 * approach (which would obscure our special handling for ' and "), duplicate
 * the content of the default case, or pass a half-dozen parameters to a
 * helper method containing the code from the default case.
 */
/**
 * Appends {@code plainText} to {@code out}, replacing every code point for
 * which {@code shouldEscapeChar} returns true with an escape sequence. Runs
 * of code points that need no escaping are copied through unchanged.
 *
 * @param plainText the text to escape
 * @param escapeToAscii passed to {@code shouldEscapeChar}; when true, code
 *     points below 0x20 or above 0x7e are escaped
 * @param jsEscapingMode determines the type of escaping to perform
 * @param out the buffer to append output to
 * @throws IOException if {@code out} fails while being appended to
 */
@SuppressWarnings("fallthrough")
public static void escapeStringBody(
CharSequence plainText, boolean escapeToAscii,
JsEscapingMode jsEscapingMode, Appendable out)
throws IOException {
int pos = 0; // Index just past the last char in plainText written to out.
int len = plainText.length();
// Walk the text one code point at a time; charCount is 2 for surrogate pairs.
for (int codePoint, charCount, i = 0; i < len; i += charCount) {
codePoint = Character.codePointAt(plainText, i);
charCount = Character.charCount(codePoint);
if (!shouldEscapeChar(codePoint, escapeToAscii, jsEscapingMode)) {
continue;
}
// Flush the pending unescaped run [pos, i), then step past the code point
// that is about to be written in escaped form.
out.append(plainText, pos, i);
pos = i + charCount;
switch (codePoint) {
// Characters with a dedicated two-character backslash escape.
case '\b': out.append("\\b"); break;
case '\t': out.append("\\t"); break;
case '\n': out.append("\\n"); break;
case '\f': out.append("\\f"); break;
case '\r': out.append("\\r"); break;
case '\\': out.append("\\\\"); break;
case '"': case '\'':
if (jsEscapingMode == JsEscapingMode.JSON && '\'' == codePoint) {
// JSON does not escape a single quote (and it should be surrounded
// by double quotes).
out.append((char) codePoint);
break;
} else if (jsEscapingMode != JsEscapingMode.EMBEDDABLE_JS) {
// Outside EMBEDDABLE_JS a quote is simply prefixed with a backslash.
out.append('\\').append((char) codePoint);
break;
}
// In EMBEDDABLE_JS mode quotes get the numeric escape produced below.
// fall through
default:
if (codePoint >= 0x100 || jsEscapingMode == JsEscapingMode.JSON) {
// Hex form is used for anything above 0xFF and for everything in JSON mode.
appendHexJavaScriptRepresentation(codePoint, out);
} else {
// Output the minimal octal encoding. We can't use an encoding
// shorter than three digits if the next digit is a valid octal
// digit. The full form is also forced at the very end of the input.
boolean pad = i + charCount >= len
|| isOctal(plainText.charAt(i + charCount));
appendOctalJavaScriptRepresentation((char) codePoint, pad, out);
}
break;
}
}
// Copy whatever unescaped text remains after the last escape.
out.append(plainText, pos, len);
}
/**
 * Decides, on behalf of {@code escapeStringBody}, whether a single code
 * point has to be written as an escape sequence.
 *
 * @param codePoint the code point under consideration
 * @param escapeToAscii when true, any code point below 0x20 or above 0x7e
 *     is escaped
 * @param jsEscapingMode the escaping flavour; in {@code JSON} mode the JSON
 *     rules are checked in addition to the JavaScript rules
 * @return true if the code point must be escaped
 */
private static boolean shouldEscapeChar(int codePoint,
    boolean escapeToAscii, JsEscapingMode jsEscapingMode) {
  boolean outsidePrintableAscii = codePoint < 0x20 || codePoint > 0x7e;
  if (escapeToAscii && outsidePrintableAscii) {
    return true;
  }

  // JSON mode applies the JSON rules first and then the JS rules as well.
  // The JS rules escape more than JSON needs, but escaping any character is
  // safe in JSON.
  // TODO(bbavar): Remove unnecessary escaping for JSON, as long as it can be
  // shown that this change in legacy behavior is safe.
  boolean jsonMode = (jsEscapingMode == JsEscapingMode.JSON);
  if (jsonMode && mustEscapeCharInJsonString(codePoint)) {
    return true;
  }

  // Every mode ends with the default JS escaping rules.
  return mustEscapeCharInJsString(codePoint);
}
/**
 * Appends the JavaScript hex escape for a code point: a backslash, the
 * letter u, and four hex digits taken from {@code HEX_CHARS}.
 *
 * <p>A supplementary code point cannot be written as one such escape, so it
 * is emitted as two escapes, one per UTF-16 surrogate. That way the value
 * round-trips when sent from Java to JavaScript and back.
 *
 * @param codePoint The codepoint to append.
 * @param out The buffer to which the hex representation should be appended.
 * @throws IOException if {@code out} fails while being appended to
 */
private static void appendHexJavaScriptRepresentation(
    int codePoint, Appendable out)
    throws IOException {
  if (Character.isSupplementaryCodePoint(codePoint)) {
    // Recurse once per surrogate; each surrogate is a BMP value.
    for (char surrogate : Character.toChars(codePoint)) {
      appendHexJavaScriptRepresentation(surrogate, out);
    }
    return;
  }
  out.append("\\u");
  // Emit the four nibbles, most significant first.
  for (int shift = 12; shift >= 0; shift -= 4) {
    out.append(HEX_CHARS[(codePoint >>> shift) & 0xf]);
  }
}
/**
 * Appends the JavaScript octal escape for a character: a backslash followed
 * by one to three octal digits taken from {@code OCTAL_CHARS}. The shortest
 * form is used unless {@code pad} is set.
 *
 * @param ch The character to append.
 * @param pad true to force use of the full 3 digit representation.
 * @param out The buffer to which the octal representation should be
 *     appended.
 * @throws IOException if {@code out} fails while being appended to
 */
private static void appendOctalJavaScriptRepresentation(
    char ch, boolean pad, Appendable out) throws IOException {
  out.append('\\');
  // Be paranoid at the end of a string since someone might call
  // this method again with another string segment: pad forces all three
  // digits even when the value would fit in fewer.
  if (pad || ch >= 0100) {
    out.append(OCTAL_CHARS[(ch >>> 6) & 0x7]);
  }
  if (pad || ch >= 010) {
    out.append(OCTAL_CHARS[(ch >>> 3) & 0x7]);
  }
  out.append(OCTAL_CHARS[ch & 0x7]);
}
/**
 * {@code StringBuilder} convenience form of the hex escaper. Although this
 * is a rather specific method, it is made public because it is also used by
 * the JSCompiler.
 *
 * @param sb the builder that receives the escape
 * @param c the character to write in hex-escaped form
 * @see #appendHexJavaScriptRepresentation(int, Appendable)
 */
public static void appendHexJavaScriptRepresentation(StringBuilder sb,
    char c) {
  final int codeUnit = c;
  final Appendable target = sb;
  try {
    appendHexJavaScriptRepresentation(codeUnit, target);
  } catch (IOException impossible) {
    // StringBuilder does not throw IOException.
    throw new RuntimeException(impossible);
  }
}
/**
 * Undo escaping as performed in javaScriptEscape(.)
 * Characters other than a backslash are copied through; each backslash
 * starts an escape sequence that is decoded by
 * {@code javaScriptUnescapeHelper}.
 *
 * @param s the escaped text
 * @return the unescaped text
 * @throws IllegalArgumentException if the string contains bad escaping
 */
public static String javaScriptUnescape(String s) {
  final int length = s.length();
  StringBuilder result = new StringBuilder(length);
  int pos = 0;
  while (pos < length) {
    char current = s.charAt(pos);
    if (current != '\\') {
      result.append(current);
      pos++;
      continue;
    }
    // The helper returns the index just past the escape it consumed.
    pos = javaScriptUnescapeHelper(s, pos + 1, result);
  }
  return result.toString();
}
/**
 * Decodes the escape code starting at index {@code i} of {@code s} (the
 * character right after the backslash) and appends the decoded character to
 * {@code sb}.
 *
 * <p>Recognized codes: {@code n r t b f}; a backslash, double quote, single
 * quote or {@code >} standing for itself; one to three octal digits (two at
 * most when the first digit is 4-7); {@code x} followed by exactly two hex
 * digits; and {@code u} followed by exactly four hex digits.
 *
 * <p>Hex digits are validated one by one. {@code Integer.parseInt} is not
 * used for them because it accepts a leading sign, which would let a
 * malformed escape such as {@code x} followed by {@code -1} decode to a
 * garbage character instead of being rejected.
 *
 * @param s the escaped text
 * @param i index of the first character after the backslash
 * @param sb receives the decoded character
 * @return the index of the first character in s
 *     after the escape code.
 * @throws IllegalArgumentException if the escape code
 *     is invalid
 */
private static int javaScriptUnescapeHelper(String s, int i,
    StringBuilder sb) {
  if (i >= s.length()) {
    throw new IllegalArgumentException(
        "End-of-string after escape character in [" + s + "]");
  }

  char c = s.charAt(i++);
  switch (c) {
    case 'n': sb.append('\n'); break;
    case 'r': sb.append('\r'); break;
    case 't': sb.append('\t'); break;
    case 'b': sb.append('\b'); break;
    case 'f': sb.append('\f'); break;
    case '\\':
    case '\"':
    case '\'':
    case '>':
      sb.append(c);
      break;
    case '0': case '1': case '2': case '3':
    case '4': case '5': case '6': case '7': {
      --i;  // backup to first octal digit
      int nOctalDigits = 1;
      // Three digits only fit in a byte when the first digit is 0-3.
      int digitLimit = c < '4' ? 3 : 2;
      while (nOctalDigits < digitLimit && i + nOctalDigits < s.length()
          && isOctal(s.charAt(i + nOctalDigits))) {
        ++nOctalDigits;
      }
      sb.append(
          (char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8));
      i += nOctalDigits;
      break;
    }
    case 'x':
    case 'u': {
      int nHexDigits = (c == 'u' ? 4 : 2);
      if (i + nHexDigits > s.length()) {
        // Not enough characters left for a complete escape.
        throw new IllegalArgumentException(
            "Invalid unicode sequence [" + s.substring(i) + "] at index " + i
            + " in [" + s + "]");
      }
      String hexCode = s.substring(i, i + nHexDigits);
      int unicodeValue = 0;
      for (int k = 0; k < nHexDigits; k++) {
        int digit = Character.digit(hexCode.charAt(k), 16);
        if (digit < 0) {
          throw new IllegalArgumentException(
              "Invalid unicode sequence [" + hexCode + "] at index " + i
              + " in [" + s + "]");
        }
        unicodeValue = (unicodeValue << 4) | digit;
      }
      sb.append((char) unicodeValue);
      i += nHexDigits;
      break;
    }
    default:
      throw new IllegalArgumentException(
          "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]"
          );
  }

  return i;
}
// Matches the characters that xmlCDataEscape removes from its input: every
// C0 control character (U+0000 through U+001F) except tab, line feed and
// carriage return, plus the two values 0xFFFE and 0xFFFF.
private static final CharMatcher CONTROL_MATCHER = CharMatcher.anyOf(
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u000B\u000C\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\uFFFE\uFFFF");
/**
* Escape a string that is meant to be embedded in a CDATA section.
* The returned string is guaranteed to be valid CDATA content.
* The syntax of CDATA sections is the following:
*
*
* The only invalid character sequence in a CDATA tag is "]]>".
* If this sequence is present in the input string, we replace
* it by closing the current CDATA field, then write ']]>',
* then reopen a new CDATA section.
*/
public static String xmlCDataEscape(String s) {
// Make sure there are no illegal control characters.
s = CONTROL_MATCHER.removeFrom(s);
// Return the original reference if the string doesn't have a match.
int found = s.indexOf("]]>");
if (found == -1) {
return s;
}
// For each occurrence of "]]>", append a string that adds "]]>" after
// the end of the CDATA which has just been closed, then opens a new CDATA.
StringBuilder sb = new StringBuilder();
int prev = 0;
do {
sb.append(s.substring(prev, found + 3));
sb.append("]]>", prev)) != -1);
sb.append(s.substring(prev));
return sb.toString();
}
/**
 * We escape some characters in s to be able to insert strings into Java code.
 * Newline, carriage return, tab, backslash and both quote characters get
 * backslash escapes; {@code &}, {@code <} and {@code >} are rewritten to
 * their HTML entities.
 *
 * @param s the text to escape
 * @return the escaped text
 * @deprecated Use {@link CharEscapers#asciiHtmlEscaper()} and {@link
 *     CharEscapers#javaCharEscaper()} or {@link CharEscapers#javaStringEscaper()}
 *     instead. This method combines two forms of escaping in a way that's rarely
 *     desired.
 */
@Deprecated
public static String javaEscape(String s) {
  return JAVA_ESCAPE.escape(s);
}

// Java escaper. The three HTML characters must map to entities; mapping them
// to themselves would leave them unescaped.
private static final CharEscaper JAVA_ESCAPE =
    new CharEscaperBuilder()
        .addEscape('\n', "\\n")
        .addEscape('\r', "\\r")
        .addEscape('\t', "\\t")
        .addEscape('\\', "\\\\")
        .addEscape('\"', "\\\"")
        .addEscape('&', "&amp;")
        .addEscape('<', "&lt;")
        .addEscape('>', "&gt;")
        .addEscape('\'', "\\\'")
        .toEscaper();
// Escaper behind regexEscape: prefixes each regex metacharacter listed here
// with a backslash.
private static final CharEscaper REGEX_ESCAPE =
    new CharEscaperBuilder()
        .addEscape('(', "\\(")
        .addEscape(')', "\\)")
        .addEscape('|', "\\|")
        .addEscape('*', "\\*")
        .addEscape('+', "\\+")
        .addEscape('?', "\\?")
        .addEscape('.', "\\.")
        .addEscape('{', "\\{")
        .addEscape('}', "\\}")
        .addEscape('[', "\\[")
        .addEscape(']', "\\]")
        .addEscape('$', "\\$")
        .addEscape('^', "\\^")
        .addEscape('\\', "\\\\")
        .toEscaper();

/**
 * Backslash-escapes the regex special characters in a string so that it can
 * be used as part of a regex pattern. This method is for use on gnu.regexp
 * style regular expressions.
 *
 * @param s the literal text to escape
 * @return {@code s} with the special characters escaped
 * @deprecated Use {@link Pattern#quote(String)} instead. Note that it may not
 *     be compatible with gnu.regexp style regular expressions.
 */
@Deprecated
public static String regexEscape(String s) {
  final String quoted = REGEX_ESCAPE.escape(s);
  return quoted;
}
/**
* If you want to preserve the exact
* current (odd) behavior when {@code doStrip} is {@code true}, use
* {@code .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on
* the splitter.
*
* @param in what to process
* @param delimiter the delimiting string
* @return the tokens
* @deprecated see the detailed instructions under
* {@link #split(String, String, boolean)}
*/
@Deprecated
public static LinkedList<[!CDATA[...]]>
* null
, a new List
will be created.
* @return The collection to which the substrings were added. This is
* syntactic sugar to allow call chaining.
* @deprecated see the detailed instructions under
* {@link #split(String, String, boolean)}
*/
@Deprecated
public static Collection
* E.g. collapse("hello world", " ", "::")
* will return the following string: "hello::world"
*
*
* @param str the string you want to munge
* @param chars all of the characters to be considered for munge
* @param replacement the replacement string
* @return munged and replaced string.
* @deprecated if {@code replacement} is the empty string, use {@link
* CharMatcher#removeFrom(CharSequence)}; if it is a single character,
* use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
* replacement strings use {@link String#replaceAll(String, String)} with
* a regular expression that matches one or more occurrences of {@code
* chars}. In all cases you must first ensure that {@code str} is not
* null.
*/
@Deprecated public static String collapse(
String str, String chars, String replacement) {
if (str == null) {
return null;
}
StringBuilder newStr = new StringBuilder();
boolean prevCharMatched = false;
char c;
for (int i = 0; i < str.length(); i++) {
c = str.charAt(i);
if (chars.indexOf(c) != -1) {
// this character is matched
if (prevCharMatched) {
// apparently a string of matched chars, so don't append anything
// to the string
continue;
}
prevCharMatched = true;
newStr.append(replacement);
} else {
prevCharMatched = false;
newStr.append(c);
}
}
return newStr.toString();
}
/**
 * Returns a string with all sequences of ISO control chars (0x00 to 0x1F and
 * 0x7F to 0x9F) replaced by the supplied string. ISO control characters are
 * identified via {@link Character#isISOControl(char)}.
 *
 * @param str the string you want to strip of ISO control chars
 * @param replacement the replacement string
 * @return a String with all control characters replaced by the replacement
 *     string, or null if input is null.
 * @deprecated use {@link CharMatcher#JAVA_ISO_CONTROL}. If {@code
 *     replacement} is the empty string, use {@link
 *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
 *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
 *     replacement strings use
 *     {@code str.replaceAll("\\p{Cntrl}+", replacement)}.
 *     In all cases you must first ensure that {@code str} is not null.
 */
@Deprecated public static String collapseControlChars(
    String str, String replacement) {
  /*
   * The StringUtil.collapse() loop is re-implemented here rather than called
   * with an input String of control chars, because matching via
   * isISOControl() is about 10x faster.
   */
  if (str == null) {
    return null;
  }

  StringBuilder out = new StringBuilder();
  boolean insideRun = false;
  for (int idx = 0, n = str.length(); idx < n; idx++) {
    char ch = str.charAt(idx);
    boolean isControl = Character.isISOControl(ch);
    if (!isControl) {
      out.append(ch);
    } else if (!insideRun) {
      // First control char of a run: emit the replacement exactly once.
      out.append(replacement);
    }
    insideRun = isControl;
  }
  return out.toString();
}
/**
 * Read a String of up to maxLength bytes from an InputStream.
 *
 * <p>The bytes are decoded with the platform default charset. No more than
 * {@code maxLength} bytes are requested from the stream, so nothing beyond
 * the limit is consumed or returned. The bytes are decoded once after
 * reading finishes, so a multi-byte character that straddles an internal
 * buffer boundary is not corrupted.
 *
 * <p>For reading the entire stream (maxLength == -1) you can use:
 * <pre>
 *   CharStreams.toString(new InputStreamReader(is, Charsets.ISO_8859_1))
 * </pre>
 * {@code CharStreams} is in the {@code com.google.common.io} package.
 *
 * <p>For maxLength >= 0 a literal translation would be
 * <pre>
 *   CharStreams.toString(new InputStreamReader(
 *       new LimitInputStream(is, maxLength), Charsets.ISO_8859_1))
 * </pre>
 * For multi-byte encodings that is broken because the limit could end in
 * the middle of the character--it would be better to limit the reader than
 * the underlying stream.
 *
 * @param is input stream
 * @param maxLength max number of bytes to read from "is". If this is -1, we
 *     read everything.
 *
 * @return String up to maxLength bytes, read from "is"
 * @throws IOException if reading from "is" fails
 * @deprecated see the advice above
 */
@Deprecated public static String stream2String(InputStream is, int maxLength)
    throws IOException {
  final boolean unlimited = (maxLength == -1);
  byte[] buffer = new byte[4096];
  java.io.ByteArrayOutputStream collected =
      new java.io.ByteArrayOutputStream();
  int totalRead = 0;
  while (unlimited || totalRead < maxLength) {
    // Never ask for more than the caller's remaining byte budget.
    int wanted = unlimited
        ? buffer.length
        : Math.min(buffer.length, maxLength - totalRead);
    int read = is.read(buffer, 0, wanted);
    if (read == -1) {
      break;
    }
    collected.write(buffer, 0, read);
    if (!unlimited) {
      totalRead += read;
    }
  }
  // Decode once, with the platform default charset as before.
  return collected.toString();
}
/**
* Parse a list of substrings separated by a given delimiter. The delimiter
* can also appear in substrings (just double them):
*
* parseDelimitedString("this|is", '|') returns ["this","is"]
* parseDelimitedString("this||is", '|') returns ["this|is"]
*
* @param list String containing delimited substrings
* @param delimiter Delimiter (anything except ' ' is allowed)
*
* @return String[] A String array of parsed substrings
*/
public static String[] parseDelimitedList(String list,
char delimiter) {
String delim = "" + delimiter;
// Append a sentinel of delimiter + space
// (see comments below for more info)
StringTokenizer st = new StringTokenizer(list + delim + " ",
delim,
true);
ArrayList
*
* capitalize("foo bar").equals("Foo bar");
* capitalize("2b or not 2b").equals("2b or not 2b")
* capitalize("Foo bar").equals("Foo bar");
* capitalize("").equals("");
*
*
* @param s the string whose first character is to be uppercased
* @return a string equivalent to s with its first character
* converted to uppercase
* @throws NullPointerException if s is null
*/
public static String capitalize(String s) {
if (s.length() == 0) {
return s;
}
char first = s.charAt(0);
char capitalized = Character.toUpperCase(first);
return (first == capitalized)
? s
: capitalized + s.substring(1);
}
/**
 * Case-insensitive counterpart of {@link String#startsWith(String)}.
 * Strings are compared in the same way as in
 * {@link String#equalsIgnoreCase}.
 *
 * @param str the string to examine
 * @param prefix the prefix to look for
 * @return a boolean indicating if str starts with prefix (case insensitive)
 */
public static boolean startsWithIgnoreCase(String str, String prefix) {
  final int prefixLength = prefix.length();
  return str.regionMatches(true, 0, prefix, 0, prefixLength);
}
/**
 * Case-insensitive counterpart of {@link String#endsWith(String)}.
 * Strings are compared in the same way as in
 * {@link String#equalsIgnoreCase}.
 *
 * @param str the string to examine
 * @param suffix the suffix to look for
 * @return a boolean indicating if str ends with suffix (case insensitive)
 */
public static boolean endsWithIgnoreCase(String str, String suffix) {
  final int suffixLength = suffix.length();
  // A suffix longer than str gives a negative start, which never matches.
  final int start = str.length() - suffixLength;
  return str.regionMatches(true, start, suffix, 0, suffixLength);
}
/**
 * Maps a code point to the length of its UTF-8 encoding, using the
 * thresholds 0x80, 0x800, 0x10000, 0x200000 and 0x4000000.
 *
 * @param c one codePoint
 * @return the number of bytes needed to encode this codePoint in UTF-8
 */
private static int bytesUtf8(int c) {
  if (c < 0x80) {
    return 1;
  }
  if (c < 0x00800) {
    return 2;
  }
  if (c < 0x10000) {
    return 3;
  }
  if (c < 0x200000) {
    return 4;
  }
  // RFC 3629 forbids the use of UTF-8 for codePoint greater than 0x10FFFF,
  // so if the caller respects this RFC, the lengths below should not happen.
  return (c < 0x4000000) ? 5 : 6;
}
/**
 * Sums {@code bytesUtf8} over every code point of the string.
 *
 * @param str a string
 * @return the number of bytes required to represent this string in UTF-8
 */
public static int bytesStorage(String str) {
  // offsetByCodePoint has a bug if its argument is the result of a
  // call to substring. To avoid this, we create a new String
  // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
  String copy = new String(str);
  int total = 0;
  int index = 0;
  while (index < copy.length()) {
    total += bytesUtf8(copy.codePointAt(index));
    index = copy.offsetByCodePoints(index, 1);
  }
  return total;
}
/**
 * Returns the longest prefix of whole code points whose UTF-8 size, as
 * computed by {@code bytesUtf8}, does not exceed {@code maxbytes}.
 *
 * @param str a string
 * @param maxbytes the byte budget
 * @return the beginning of the string, so that it uses at most
 *     maxbytes bytes in UTF-8
 * @throws IndexOutOfBoundsException if maxbytes is negative
 */
public static String truncateStringForUtf8Storage(String str, int maxbytes) {
  if (maxbytes < 0) {
    throw new IndexOutOfBoundsException();
  }

  // offsetByCodePoint has a bug if its argument is the result of a
  // call to substring. To avoid this, we create a new String
  // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
  // TODO(cquinn): should be fixed as of 1.5.0_01
  String copy = new String(str);

  int end = 0;        // char index just past the last code point kept
  int budgetUsed = 0; // UTF-8 bytes accounted for so far
  while (end < copy.length()) {
    int cost = StringUtil.bytesUtf8(copy.codePointAt(end));
    if (budgetUsed + cost > maxbytes) {
      break;
    }
    budgetUsed += cost;
    end = copy.offsetByCodePoints(end, 1);
  }
  return copy.substring(0, end);
}
/**
 * If the given string is of length {@code maxLength} or less, then it is
 * returned as is.
 * If the string is longer than {@code maxLength}, the returned string is
 * truncated before the last space character on or before
 * {@code source.charAt(maxLength)}. If the string has no spaces, the
 * returned string is truncated to {@code maxLength}.
 *
 * @param source the string to truncate if necessary
 * @param maxLength the maximum length of the result
 * @return the original string if its length is less than or equal to
 *     maxLength, otherwise a truncated string as mentioned above
 */
public static String truncateIfNecessary(String source, int maxLength) {
  if (source.length() <= maxLength) {
    return source;
  }
  String head = unicodePreservingSubstring(source, 0, maxLength);

  @SuppressWarnings("deprecation") // we'll make this go away before that does
  CharMatcher legacyWhitespace = CharMatcher.LEGACY_WHITESPACE;
  String trimmed = legacyWhitespace.trimTrailingFrom(head);

  // Shorter than maxLength: whitespace at the cut was stripped away.
  // Exactly maxLength with a space as the next source char: the cut already
  // fell on a word boundary. Either way we are done.
  if (trimmed.length() < maxLength
      || Character.isSpaceChar(source.charAt(maxLength))) {
    return trimmed;
  }

  // The cut fell inside a word. Back up to the last space, if there is one.
  int lastSpace = trimmed.length() - 1;
  while (lastSpace >= 0 && !Character.isSpaceChar(trimmed.charAt(lastSpace))) {
    lastSpace--;
  }
  if (lastSpace < 0) {
    return trimmed;
  }
  return legacyWhitespace.trimTrailingFrom(trimmed.substring(0, lastSpace));
}
/**
 * If this given string is of length {@code maxLength} or less, it will
 * be returned as-is.
 * Otherwise it will be truncated to {@code maxLength}, regardless of whether
 * there are any space characters in the String. If an ellipsis is requested
 * to be appended to the truncated String, the String will be truncated so
 * that the ellipsis will also fit within maxLength.
 * If no truncation was necessary, no ellipsis will be added.
 *
 * @param source the String to truncate if necessary
 * @param maxLength the maximum number of characters to keep
 * @param addEllipsis if true, and if the String had to be truncated,
 *     add "..." to the end of the String before returning. Additionally,
 *     the ellipsis will only be added if maxLength is greater than 3.
 * @return the original string if its length is less than or equal to
 *     maxLength, otherwise a truncated string as mentioned above
 */
public static String truncateAtMaxLength(String source, int maxLength,
    boolean addEllipsis) {
  if (source.length() <= maxLength) {
    return source;
  }
  boolean withEllipsis = addEllipsis && maxLength > 3;
  // Leave room for the three dots when they are going to be appended.
  int keep = withEllipsis ? maxLength - 3 : maxLength;
  String head = unicodePreservingSubstring(source, 0, keep);
  return withEllipsis ? head + "..." : head;
}
/**
 * Returns the part of {@code str} that starts at {@code begin} and runs to
 * the end of the string. Equivalent to:
 *
 * <pre>
 *   {@link #unicodePreservingSubstring(String, int, int)}(
 *       str, begin, str.length())
 * </pre>
 *
 * NOTE(review): the three-argument overload presumably adjusts the indices
 * so that they respect Unicode character boundaries; confirm there.
 *
 * @param str the source string
 * @param begin index at which the substring starts
 * @return the result of the three-argument overload
 */
public static String unicodePreservingSubstring(String str, int begin) {
  final int end = str.length();
  return unicodePreservingSubstring(str, begin, end);
}
/**
 * True iff the given character needs to be escaped in a javascript string
 * literal. The decision is a lookup in {@code JS_ESCAPE_CHARS}.
 *
 * <p>TODO(msamuel): If we're being paranoid, should we escape + to avoid
 * UTF-7 attacks?
 *
 * <p>According to EcmaScript 262 Section 7.1:
 * <blockquote>
 * The format control characters can occur anywhere in the source text of
 * an ECMAScript program. These characters are removed from the source
 * text before applying the lexical grammar.
 * </blockquote>
 * The same specification also defines the characters that are considered
 * to be line terminators.
 *
 * @param codepoint the code point to test
 * @return true if the code point is in {@code JS_ESCAPE_CHARS}
 */
static boolean mustEscapeCharInJsString(int codepoint) {
  final boolean needsEscape = JS_ESCAPE_CHARS.contains(codepoint);
  return needsEscape;
}
/**
* True iff the given character needs to be escaped in a JSON string literal.
*
* Code Point Value Name Formal Name
* \u000A Line Feed [LF]
* \u000D Carriage Return [CR]
* \u2028 Line separator [LS]
* \u2029 Paragraph separator [PS]
*
*
*