1 /* Copyright (c) 2008 Google Inc. 2 * 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 package org.yaml.snakeyaml.external.com.google.gdata.util.common.base; 17 18 /** 19 * A {@code UnicodeEscaper} that escapes some set of Java characters using the 20 * URI percent encoding scheme. The set of safe characters (those which remain 21 * unescaped) can be specified on construction. 22 * 23 * <p> 24 * For details on escaping URIs for use in web pages, see section 2.4 of <a 25 * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>. 26 * 27 * <p> 28 * In most cases this class should not need to be used directly. If you have no 29 * special requirements for escaping your URIs, you should use either 30 * {@link CharEscapers#uriEscaper()} or {@link CharEscapers#uriEscaper(boolean)}. 31 * 32 * <p> 33 * When encoding a String, the following rules apply: 34 * <ul> 35 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" 36 * through "9" remain the same. 37 * <li>Any additionally specified safe characters remain the same. 38 * <li>If {@code plusForSpace} was specified, the space character " " is 39 * converted into a plus sign "+". 40 * <li>All other characters are converted into one or more bytes using UTF-8 41 * encoding and each byte is then represented by the 3-character string "%XY", 42 * where "XY" is the two-digit, uppercase, hexadecimal representation of the 43 * byte value. 44 * </ul> 45 * 46 * <p> 47 * RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!", 48 * "~", "*", "'", "(" and ")". It goes on to state: 49 * 50 * <p> 51 * <i>Unreserved characters can be escaped without changing the semantics of the 52 * URI, but this should not be done unless the URI is being used in a context 53 * that does not allow the unescaped character to appear.</i> 54 * 55 * <p> 56 * For performance reasons the only currently supported character encoding of 57 * this class is UTF-8. 58 * 59 * <p> 60 * <b>Note</b>: This escaper produces uppercase hexidecimal sequences. From <a 61 * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br> 62 * <i>"URI producers and normalizers should use uppercase hexadecimal digits for 63 * all percent-encodings."</i> 64 * 65 * 66 */ 67 public class PercentEscaper extends UnicodeEscaper { 68 /** 69 * A string of safe characters that mimics the behavior of 70 * {@link java.net.URLEncoder}. 71 * 72 */ 73 public static final String SAFECHARS_URLENCODER = "-_.*"; 74 75 /** 76 * A string of characters that do not need to be encoded when used in URI 77 * path segments, as specified in RFC 3986. Note that some of these 78 * characters do need to be escaped when used in other parts of the URI. 79 */ 80 public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;="; 81 82 /** 83 * A string of characters that do not need to be encoded when used in URI 84 * query strings, as specified in RFC 3986. Note that some of these 85 * characters do need to be escaped when used in other parts of the URI. 86 */ 87 public static final String SAFEQUERYSTRINGCHARS_URLENCODER = "-_.!~*'()@:$,;/?:"; 88 89 // In some uri escapers spaces are escaped to '+' 90 private static final char[] URI_ESCAPED_SPACE = { '+' }; 91 92 private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray(); 93 94 /** 95 * If true we should convert space to the {@code +} character. 96 */ 97 private final boolean plusForSpace; 98 99 /** 100 * An array of flags where for any {@code char c} if {@code safeOctets[c]} 101 * is true then {@code c} should remain unmodified in the output. If 102 * {@code c > safeOctets.length} then it should be escaped. 103 */ 104 private final boolean[] safeOctets; 105 106 /** 107 * Constructs a URI escaper with the specified safe characters and optional 108 * handling of the space character. 109 * 110 * @param safeChars 111 * a non null string specifying additional safe characters for 112 * this escaper (the ranges 0..9, a..z and A..Z are always safe 113 * and should not be specified here) 114 * @param plusForSpace 115 * true if ASCII space should be escaped to {@code +} rather than 116 * {@code %20} 117 * @throws IllegalArgumentException 118 * if any of the parameters were invalid 119 */ PercentEscaper(String safeChars, boolean plusForSpace)120 public PercentEscaper(String safeChars, boolean plusForSpace) { 121 // Avoid any misunderstandings about the behavior of this escaper 122 if (safeChars.matches(".*[0-9A-Za-z].*")) { 123 throw new IllegalArgumentException( 124 "Alphanumeric characters are always 'safe' and should not be " 125 + "explicitly specified"); 126 } 127 // Avoid ambiguous parameters. Safe characters are never modified so if 128 // space is a safe character then setting plusForSpace is meaningless. 129 if (plusForSpace && safeChars.contains(" ")) { 130 throw new IllegalArgumentException( 131 "plusForSpace cannot be specified when space is a 'safe' character"); 132 } 133 if (safeChars.contains("%")) { 134 throw new IllegalArgumentException("The '%' character cannot be specified as 'safe'"); 135 } 136 this.plusForSpace = plusForSpace; 137 this.safeOctets = createSafeOctets(safeChars); 138 } 139 140 /** 141 * Creates a boolean[] with entries corresponding to the character values 142 * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array 143 * is as small as is required to hold the given character information. 144 */ createSafeOctets(String safeChars)145 private static boolean[] createSafeOctets(String safeChars) { 146 int maxChar = 'z'; 147 char[] safeCharArray = safeChars.toCharArray(); 148 for (char c : safeCharArray) { 149 maxChar = Math.max(c, maxChar); 150 } 151 boolean[] octets = new boolean[maxChar + 1]; 152 for (int c = '0'; c <= '9'; c++) { 153 octets[c] = true; 154 } 155 for (int c = 'A'; c <= 'Z'; c++) { 156 octets[c] = true; 157 } 158 for (int c = 'a'; c <= 'z'; c++) { 159 octets[c] = true; 160 } 161 for (char c : safeCharArray) { 162 octets[c] = true; 163 } 164 return octets; 165 } 166 167 /* 168 * Overridden for performance. For unescaped strings this improved the 169 * performance of the uri escaper from ~760ns to ~400ns as measured by 170 * {@link CharEscapersBenchmark}. 171 */ 172 @Override nextEscapeIndex(CharSequence csq, int index, int end)173 protected int nextEscapeIndex(CharSequence csq, int index, int end) { 174 for (; index < end; index++) { 175 char c = csq.charAt(index); 176 if (c >= safeOctets.length || !safeOctets[c]) { 177 break; 178 } 179 } 180 return index; 181 } 182 183 /* 184 * Overridden for performance. For unescaped strings this improved the 185 * performance of the uri escaper from ~400ns to ~170ns as measured by 186 * {@link CharEscapersBenchmark}. 187 */ 188 @Override escape(String s)189 public String escape(String s) { 190 int slen = s.length(); 191 for (int index = 0; index < slen; index++) { 192 char c = s.charAt(index); 193 if (c >= safeOctets.length || !safeOctets[c]) { 194 return escapeSlow(s, index); 195 } 196 } 197 return s; 198 } 199 200 /** 201 * Escapes the given Unicode code point in UTF-8. 202 */ 203 @Override escape(int cp)204 protected char[] escape(int cp) { 205 // We should never get negative values here but if we do it will throw 206 // an 207 // IndexOutOfBoundsException, so at least it will get spotted. 208 if (cp < safeOctets.length && safeOctets[cp]) { 209 return null; 210 } else if (cp == ' ' && plusForSpace) { 211 return URI_ESCAPED_SPACE; 212 } else if (cp <= 0x7F) { 213 // Single byte UTF-8 characters 214 // Start with "%--" and fill in the blanks 215 char[] dest = new char[3]; 216 dest[0] = '%'; 217 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 218 dest[1] = UPPER_HEX_DIGITS[cp >>> 4]; 219 return dest; 220 } else if (cp <= 0x7ff) { 221 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff] 222 // Start with "%--%--" and fill in the blanks 223 char[] dest = new char[6]; 224 dest[0] = '%'; 225 dest[3] = '%'; 226 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 227 cp >>>= 4; 228 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 229 cp >>>= 2; 230 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 231 cp >>>= 4; 232 dest[1] = UPPER_HEX_DIGITS[0xC | cp]; 233 return dest; 234 } else if (cp <= 0xffff) { 235 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff] 236 // Start with "%E-%--%--" and fill in the blanks 237 char[] dest = new char[9]; 238 dest[0] = '%'; 239 dest[1] = 'E'; 240 dest[3] = '%'; 241 dest[6] = '%'; 242 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 243 cp >>>= 4; 244 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 245 cp >>>= 2; 246 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 247 cp >>>= 4; 248 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 249 cp >>>= 2; 250 dest[2] = UPPER_HEX_DIGITS[cp]; 251 return dest; 252 } else if (cp <= 0x10ffff) { 253 char[] dest = new char[12]; 254 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff] 255 // Start with "%F-%--%--%--" and fill in the blanks 256 dest[0] = '%'; 257 dest[1] = 'F'; 258 dest[3] = '%'; 259 dest[6] = '%'; 260 dest[9] = '%'; 261 dest[11] = UPPER_HEX_DIGITS[cp & 0xF]; 262 cp >>>= 4; 263 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 264 cp >>>= 2; 265 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 266 cp >>>= 4; 267 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 268 cp >>>= 2; 269 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 270 cp >>>= 4; 271 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 272 cp >>>= 2; 273 dest[2] = UPPER_HEX_DIGITS[cp & 0x7]; 274 return dest; 275 } else { 276 // If this ever happens it is due to bug in UnicodeEscaper, not bad 277 // input. 278 throw new IllegalArgumentException("Invalid unicode character value " + cp); 279 } 280 } 281 } 282