/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.escape;

import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;

import java.util.HashMap;
import java.util.Map;

import javax.annotation.Nullable;

/**
 * Static utility methods pertaining to {@link Escaper} instances.
 *
 * @author Sven Mawson
 * @author David Beaumont
 * @since 15.0
 */
@Beta
@GwtCompatible
public final class Escapers {
  // Static utility class; not instantiable.
  private Escapers() {}

  /**
   * Returns an {@link Escaper} that does no escaping, passing all character
   * data through unchanged.
   */
  public static Escaper nullEscaper() {
    return NULL_ESCAPER;
  }

  // An Escaper that efficiently performs no escaping.
  // Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier.
  private static final Escaper NULL_ESCAPER = new CharEscaper() {
    @Override public String escape(String string) {
      // The input is returned as-is; only null is rejected.
      return checkNotNull(string);
    }

    @Override protected char[] escape(char c) {
      // TODO: Fix tests not to call this directly and make it throw an error.
      return null;
    }
  };

  /**
   * Returns a builder for creating simple, fast escapers. A builder instance
   * can be reused and each escaper that is created will be a snapshot of the
   * current builder state. Builders are not thread safe.
   *
   * <p>The initial state of the builder is such that:
   * <ul>
   * <li>There are no replacement mappings</li>
   * <li>{@code safeMin == Character.MIN_VALUE}</li>
   * <li>{@code safeMax == Character.MAX_VALUE}</li>
   * <li>{@code unsafeReplacement == null}</li>
   * </ul>
   * <p>For performance reasons escapers created by this builder are not
   * Unicode aware and will not validate the well-formedness of their input.
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * A builder for simple, fast escapers.
   *
   * <p>Typically an escaper needs to deal with the escaping of high valued
   * characters or code points. In these cases it is necessary to extend either
   * {@link ArrayBasedCharEscaper} or {@link ArrayBasedUnicodeEscaper} to
   * provide the desired behavior. However this builder is suitable for creating
   * escapers that replace a relatively small set of characters.
   *
   * @author David Beaumont
   * @since 15.0
   */
  @Beta
  public static final class Builder {
    private final Map<Character, String> replacementMap =
        new HashMap<Character, String>();
    private char safeMin = Character.MIN_VALUE;
    private char safeMax = Character.MAX_VALUE;
    private String unsafeReplacement = null;

    // The constructor is exposed via the builder() method above.
    private Builder() {}

    /**
     * Sets the safe range of characters for the escaper. Characters in this
     * range that have no explicit replacement are considered 'safe' and remain
     * unescaped in the output. If {@code safeMax < safeMin} then the safe range
     * is empty.
     *
     * @param safeMin the lowest 'safe' character
     * @param safeMax the highest 'safe' character
     * @return the builder instance
     */
    public Builder setSafeRange(char safeMin, char safeMax) {
      this.safeMin = safeMin;
      this.safeMax = safeMax;
      return this;
    }

    /**
     * Sets the replacement string for any characters outside the 'safe' range
     * that have no explicit replacement. If {@code unsafeReplacement} is
     * {@code null} then no replacement will occur, if it is {@code ""} then
     * the unsafe characters are removed from the output.
     *
     * @param unsafeReplacement the string to replace unsafe characters
     * @return the builder instance
     */
    public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) {
      this.unsafeReplacement = unsafeReplacement;
      return this;
    }

    /**
     * Adds a replacement string for the given input character. The specified
     * character will be replaced by the given string whenever it occurs in the
     * input, irrespective of whether it lies inside or outside the 'safe'
     * range.
     *
     * @param c the character to be replaced
     * @param replacement the string to replace the given character
     * @return the builder instance
     * @throws NullPointerException if {@code replacement} is null
     */
    public Builder addEscape(char c, String replacement) {
      checkNotNull(replacement);
      // This can replace an existing character (the builder is re-usable).
      replacementMap.put(c, replacement);
      return this;
    }

    /**
     * Returns a new escaper based on the current state of the builder.
     */
    public Escaper build() {
      return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) {
        // Captured once when the escaper is created, so a later call to
        // setUnsafeReplacement() on the builder does not affect this escaper.
        private final char[] replacementChars =
            unsafeReplacement != null ? unsafeReplacement.toCharArray() : null;

        @Override protected char[] escapeUnsafe(char c) {
          return replacementChars;
        }
      };
    }
  }

  /**
   * Returns a {@link UnicodeEscaper} equivalent to the given escaper instance.
   * If the escaper is already a UnicodeEscaper then it is simply returned,
   * otherwise it is wrapped in a UnicodeEscaper.
   *
   * <p>When a {@link CharEscaper} escaper is wrapped by this method it acquires
   * extra behavior with respect to the well-formedness of Unicode character
   * sequences and will throw {@link IllegalArgumentException} when given bad
   * input.
   *
   * @param escaper the instance to be wrapped
   * @return a UnicodeEscaper with the same behavior as the given instance
   * @throws NullPointerException if escaper is null
   * @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a
   *         CharEscaper
   */
  static UnicodeEscaper asUnicodeEscaper(Escaper escaper) {
    checkNotNull(escaper);
    if (escaper instanceof UnicodeEscaper) {
      return (UnicodeEscaper) escaper;
    } else if (escaper instanceof CharEscaper) {
      return wrap((CharEscaper) escaper);
    }
    // In practice this shouldn't happen because it would be very odd not to
    // extend either CharEscaper or UnicodeEscaper for non trivial cases.
    throw new IllegalArgumentException("Cannot create a UnicodeEscaper from: " +
        escaper.getClass().getName());
  }

  /**
   * Returns a string that would replace the given character in the specified
   * escaper, or {@code null} if no replacement should be made. This method is
   * intended for use in tests through the {@code EscaperAsserts} class;
   * production users of {@link CharEscaper} should limit themselves to its
   * public interface.
   *
   * @param c the character to escape if necessary
   * @return the replacement string, or {@code null} if no escaping was needed
   */
  public static String computeReplacement(CharEscaper escaper, char c) {
    return stringOrNull(escaper.escape(c));
  }

  /**
   * Returns a string that would replace the given character in the specified
   * escaper, or {@code null} if no replacement should be made. This method is
   * intended for use in tests through the {@code EscaperAsserts} class;
   * production users of {@link UnicodeEscaper} should limit themselves to its
   * public interface.
   *
   * @param cp the Unicode code point to escape if necessary
   * @return the replacement string, or {@code null} if no escaping was needed
   */
  public static String computeReplacement(UnicodeEscaper escaper, int cp) {
    return stringOrNull(escaper.escape(cp));
  }

  /**
   * Converts an escaper's {@code char[]} result into a {@code String},
   * propagating {@code null} (meaning "no replacement") unchanged.
   */
  private static String stringOrNull(char[] in) {
    return (in == null) ? null : new String(in);
  }

  /**
   * Private helper to wrap a CharEscaper as a UnicodeEscaper.
   *
   * <p>Code points below {@link Character#MIN_SUPPLEMENTARY_CODE_POINT} are
   * passed to the wrapped escaper as a single {@code char}. Supplementary code
   * points are split into a surrogate pair; each half is passed to the wrapped
   * escaper separately and the two results are concatenated.
   */
  private static UnicodeEscaper wrap(final CharEscaper escaper) {
    return new UnicodeEscaper() {
      @Override protected char[] escape(int cp) {
        // If a code point maps to a single character, just escape that.
        if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
          return escaper.escape((char) cp);
        }
        // Convert the code point to a surrogate pair and escape them both.
        // Note: This code path is horribly slow and typically allocates 4 new
        // char[] each time it is invoked. However this avoids any
        // synchronization issues and makes the escaper thread safe.
        char[] surrogateChars = new char[2];
        Character.toChars(cp, surrogateChars, 0);
        char[] hiChars = escaper.escape(surrogateChars[0]);
        char[] loChars = escaper.escape(surrogateChars[1]);

        // If either hiChars or loChars are non-null, the CharEscaper is trying
        // to escape the characters of a surrogate pair separately. This is
        // uncommon and applies only to escapers that assume UCS-2 rather than
        // UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2
        if (hiChars == null && loChars == null) {
          // We expect this to be the common code path for most escapers.
          return null;
        }
        // Combine the characters and/or escaped sequences into a single array.
        // A null result means "leave this surrogate as-is", which contributes
        // exactly one char to the output.
        int hiCount = hiChars != null ? hiChars.length : 1;
        int loCount = loChars != null ? loChars.length : 1;
        char[] output = new char[hiCount + loCount];
        if (hiChars != null) {
          System.arraycopy(hiChars, 0, output, 0, hiChars.length);
        } else {
          output[0] = surrogateChars[0];
        }
        if (loChars != null) {
          System.arraycopy(loChars, 0, output, hiCount, loChars.length);
        } else {
          output[hiCount] = surrogateChars[1];
        }
        return output;
      }
    };
  }
}