• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright (c) 2008 Google Inc.
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 package org.yaml.snakeyaml.external.com.google.gdata.util.common.base;
17 
18 /**
19  * A {@code UnicodeEscaper} that escapes some set of Java characters using the
20  * URI percent encoding scheme. The set of safe characters (those which remain
21  * unescaped) can be specified on construction.
22  *
23  * <p>
24  * For details on escaping URIs for use in web pages, see section 2.4 of <a
25  * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
26  *
27  * <p>
28  * In most cases this class should not need to be used directly. If you have no
29  * special requirements for escaping your URIs, you should use either
30  * {@link CharEscapers#uriEscaper()} or {@link CharEscapers#uriEscaper(boolean)}.
31  *
32  * <p>
33  * When encoding a String, the following rules apply:
34  * <ul>
35  * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
36  * through "9" remain the same.
37  * <li>Any additionally specified safe characters remain the same.
38  * <li>If {@code plusForSpace} was specified, the space character " " is
39  * converted into a plus sign "+".
40  * <li>All other characters are converted into one or more bytes using UTF-8
41  * encoding and each byte is then represented by the 3-character string "%XY",
42  * where "XY" is the two-digit, uppercase, hexadecimal representation of the
43  * byte value.
44  * </ul>
45  *
46  * <p>
47  * RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
48  * "~", "*", "'", "(" and ")". It goes on to state:
49  *
50  * <p>
51  * <i>Unreserved characters can be escaped without changing the semantics of the
52  * URI, but this should not be done unless the URI is being used in a context
53  * that does not allow the unescaped character to appear.</i>
54  *
55  * <p>
56  * For performance reasons the only currently supported character encoding of
57  * this class is UTF-8.
58  *
59  * <p>
60  * <b>Note</b>: This escaper produces uppercase hexadecimal sequences. From <a
61  * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
62  * <i>"URI producers and normalizers should use uppercase hexadecimal digits for
63  * all percent-encodings."</i>
64  *
65  *
66  */
67 public class PercentEscaper extends UnicodeEscaper {
68     /**
69      * A string of safe characters that mimics the behavior of
70      * {@link java.net.URLEncoder}.
71      *
72      */
73     public static final String SAFECHARS_URLENCODER = "-_.*";
74 
75     /**
76      * A string of characters that do not need to be encoded when used in URI
77      * path segments, as specified in RFC 3986. Note that some of these
78      * characters do need to be escaped when used in other parts of the URI.
79      */
80     public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
81 
82     /**
83      * A string of characters that do not need to be encoded when used in URI
84      * query strings, as specified in RFC 3986. Note that some of these
85      * characters do need to be escaped when used in other parts of the URI.
86      */
87     public static final String SAFEQUERYSTRINGCHARS_URLENCODER = "-_.!~*'()@:$,;/?:";
88 
89     // In some uri escapers spaces are escaped to '+'
90     private static final char[] URI_ESCAPED_SPACE = { '+' };
91 
92     private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray();
93 
94     /**
95      * If true we should convert space to the {@code +} character.
96      */
97     private final boolean plusForSpace;
98 
99     /**
100      * An array of flags where for any {@code char c} if {@code safeOctets[c]}
101      * is true then {@code c} should remain unmodified in the output. If
102      * {@code c > safeOctets.length} then it should be escaped.
103      */
104     private final boolean[] safeOctets;
105 
106     /**
107      * Constructs a URI escaper with the specified safe characters and optional
108      * handling of the space character.
109      *
110      * @param safeChars
111      *            a non null string specifying additional safe characters for
112      *            this escaper (the ranges 0..9, a..z and A..Z are always safe
113      *            and should not be specified here)
114      * @param plusForSpace
115      *            true if ASCII space should be escaped to {@code +} rather than
116      *            {@code %20}
117      * @throws IllegalArgumentException
118      *             if any of the parameters were invalid
119      */
PercentEscaper(String safeChars, boolean plusForSpace)120     public PercentEscaper(String safeChars, boolean plusForSpace) {
121         // Avoid any misunderstandings about the behavior of this escaper
122         if (safeChars.matches(".*[0-9A-Za-z].*")) {
123             throw new IllegalArgumentException(
124                     "Alphanumeric characters are always 'safe' and should not be "
125                             + "explicitly specified");
126         }
127         // Avoid ambiguous parameters. Safe characters are never modified so if
128         // space is a safe character then setting plusForSpace is meaningless.
129         if (plusForSpace && safeChars.contains(" ")) {
130             throw new IllegalArgumentException(
131                     "plusForSpace cannot be specified when space is a 'safe' character");
132         }
133         if (safeChars.contains("%")) {
134             throw new IllegalArgumentException("The '%' character cannot be specified as 'safe'");
135         }
136         this.plusForSpace = plusForSpace;
137         this.safeOctets = createSafeOctets(safeChars);
138     }
139 
140     /**
141      * Creates a boolean[] with entries corresponding to the character values
142      * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
143      * is as small as is required to hold the given character information.
144      */
createSafeOctets(String safeChars)145     private static boolean[] createSafeOctets(String safeChars) {
146         int maxChar = 'z';
147         char[] safeCharArray = safeChars.toCharArray();
148         for (char c : safeCharArray) {
149             maxChar = Math.max(c, maxChar);
150         }
151         boolean[] octets = new boolean[maxChar + 1];
152         for (int c = '0'; c <= '9'; c++) {
153             octets[c] = true;
154         }
155         for (int c = 'A'; c <= 'Z'; c++) {
156             octets[c] = true;
157         }
158         for (int c = 'a'; c <= 'z'; c++) {
159             octets[c] = true;
160         }
161         for (char c : safeCharArray) {
162             octets[c] = true;
163         }
164         return octets;
165     }
166 
167     /*
168      * Overridden for performance. For unescaped strings this improved the
169      * performance of the uri escaper from ~760ns to ~400ns as measured by
170      * {@link CharEscapersBenchmark}.
171      */
172     @Override
nextEscapeIndex(CharSequence csq, int index, int end)173     protected int nextEscapeIndex(CharSequence csq, int index, int end) {
174         for (; index < end; index++) {
175             char c = csq.charAt(index);
176             if (c >= safeOctets.length || !safeOctets[c]) {
177                 break;
178             }
179         }
180         return index;
181     }
182 
183     /*
184      * Overridden for performance. For unescaped strings this improved the
185      * performance of the uri escaper from ~400ns to ~170ns as measured by
186      * {@link CharEscapersBenchmark}.
187      */
188     @Override
escape(String s)189     public String escape(String s) {
190         int slen = s.length();
191         for (int index = 0; index < slen; index++) {
192             char c = s.charAt(index);
193             if (c >= safeOctets.length || !safeOctets[c]) {
194                 return escapeSlow(s, index);
195             }
196         }
197         return s;
198     }
199 
200     /**
201      * Escapes the given Unicode code point in UTF-8.
202      */
203     @Override
escape(int cp)204     protected char[] escape(int cp) {
205         // We should never get negative values here but if we do it will throw
206         // an
207         // IndexOutOfBoundsException, so at least it will get spotted.
208         if (cp < safeOctets.length && safeOctets[cp]) {
209             return null;
210         } else if (cp == ' ' && plusForSpace) {
211             return URI_ESCAPED_SPACE;
212         } else if (cp <= 0x7F) {
213             // Single byte UTF-8 characters
214             // Start with "%--" and fill in the blanks
215             char[] dest = new char[3];
216             dest[0] = '%';
217             dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
218             dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
219             return dest;
220         } else if (cp <= 0x7ff) {
221             // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
222             // Start with "%--%--" and fill in the blanks
223             char[] dest = new char[6];
224             dest[0] = '%';
225             dest[3] = '%';
226             dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
227             cp >>>= 4;
228             dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
229             cp >>>= 2;
230             dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
231             cp >>>= 4;
232             dest[1] = UPPER_HEX_DIGITS[0xC | cp];
233             return dest;
234         } else if (cp <= 0xffff) {
235             // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
236             // Start with "%E-%--%--" and fill in the blanks
237             char[] dest = new char[9];
238             dest[0] = '%';
239             dest[1] = 'E';
240             dest[3] = '%';
241             dest[6] = '%';
242             dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
243             cp >>>= 4;
244             dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
245             cp >>>= 2;
246             dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
247             cp >>>= 4;
248             dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
249             cp >>>= 2;
250             dest[2] = UPPER_HEX_DIGITS[cp];
251             return dest;
252         } else if (cp <= 0x10ffff) {
253             char[] dest = new char[12];
254             // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
255             // Start with "%F-%--%--%--" and fill in the blanks
256             dest[0] = '%';
257             dest[1] = 'F';
258             dest[3] = '%';
259             dest[6] = '%';
260             dest[9] = '%';
261             dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
262             cp >>>= 4;
263             dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
264             cp >>>= 2;
265             dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
266             cp >>>= 4;
267             dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
268             cp >>>= 2;
269             dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
270             cp >>>= 4;
271             dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
272             cp >>>= 2;
273             dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
274             return dest;
275         } else {
276             // If this ever happens it is due to bug in UnicodeEscaper, not bad
277             // input.
278             throw new IllegalArgumentException("Invalid unicode character value " + cp);
279         }
280     }
281 }
282