• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2008 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5  * in compliance with the License. You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software distributed under the License
10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11  * or implied. See the License for the specific language governing permissions and limitations under
12  * the License.
13  */
14 
15 package com.google.common.net;
16 
17 import static com.google.common.base.Preconditions.checkNotNull;
18 
19 import com.google.common.annotations.GwtCompatible;
20 import com.google.common.escape.UnicodeEscaper;
21 import javax.annotation.CheckForNull;
22 
23 /**
24  * A {@code UnicodeEscaper} that escapes some set of Java characters using a UTF-8 based percent
25  * encoding scheme. The set of safe characters (those which remain unescaped) can be specified on
26  * construction.
27  *
28  * <p>This class is primarily used for creating URI escapers in {@link UrlEscapers} but can be used
29  * directly if required. While URI escapers impose specific semantics on which characters are
30  * considered 'safe', this class has a minimal set of restrictions.
31  *
32  * <p>When escaping a String, the following rules apply:
33  *
34  * <ul>
35  *   <li>All specified safe characters remain unchanged.
36  *   <li>If {@code plusForSpace} was specified, the space character " " is converted into a plus
37  *       sign {@code "+"}.
38  *   <li>All other characters are converted into one or more bytes using UTF-8 encoding and each
39  *       byte is then represented by the 3-character string "%XX", where "XX" is the two-digit,
40  *       uppercase, hexadecimal representation of the byte value.
41  * </ul>
42  *
43  * <p>For performance reasons the only currently supported character encoding of this class is
44  * UTF-8.
45  *
46  * <p><b>Note:</b> This escaper produces <a
47  * href="https://url.spec.whatwg.org/#percent-encode">uppercase</a> hexadecimal sequences.
48  *
49  * @author David Beaumont
50  * @since 15.0
51  */
52 @GwtCompatible
53 @ElementTypesAreNonnullByDefault
54 public final class PercentEscaper extends UnicodeEscaper {
55 
56   // In some escapers spaces are escaped to '+'
57   private static final char[] PLUS_SIGN = {'+'};
58 
59   // Percent escapers output upper case hex digits (uri escapers require this).
60   private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray();
61 
62   /** If true we should convert space to the {@code +} character. */
63   private final boolean plusForSpace;
64 
65   /**
66    * An array of flags where for any {@code char c} if {@code safeOctets[c]} is true then {@code c}
67    * should remain unmodified in the output. If {@code c >= safeOctets.length} then it should be
68    * escaped.
69    */
70   private final boolean[] safeOctets;
71 
72   /**
73    * Constructs a percent escaper with the specified safe characters and optional handling of the
74    * space character.
75    *
76    * <p>Not that it is allowed, but not necessarily desirable to specify {@code %} as a safe
77    * character. This has the effect of creating an escaper which has no well-defined inverse but it
78    * can be useful when escaping additional characters.
79    *
80    * @param safeChars a non-null string specifying additional safe characters for this escaper (the
81    *     ranges 0..9, a..z and A..Z are always safe and should not be specified here)
82    * @param plusForSpace true if ASCII space should be escaped to {@code +} rather than {@code %20}
83    * @throws IllegalArgumentException if any of the parameters were invalid
84    */
PercentEscaper(String safeChars, boolean plusForSpace)85   public PercentEscaper(String safeChars, boolean plusForSpace) {
86     // TODO(dbeaumont): Switch to static factory methods for creation now that class is final.
87     // TODO(dbeaumont): Support escapers where alphanumeric chars are not safe.
88     checkNotNull(safeChars); // eager for GWT.
89     // Avoid any misunderstandings about the behavior of this escaper
90     if (safeChars.matches(".*[0-9A-Za-z].*")) {
91       throw new IllegalArgumentException(
92           "Alphanumeric characters are always 'safe' and should not be explicitly specified");
93     }
94     safeChars += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
95     // Avoid ambiguous parameters. Safe characters are never modified so if
96     // space is a safe character then setting plusForSpace is meaningless.
97     if (plusForSpace && safeChars.contains(" ")) {
98       throw new IllegalArgumentException(
99           "plusForSpace cannot be specified when space is a 'safe' character");
100     }
101     this.plusForSpace = plusForSpace;
102     this.safeOctets = createSafeOctets(safeChars);
103   }
104 
105   /**
106    * Creates a boolean array with entries corresponding to the character values specified in
107    * safeChars set to true. The array is as small as is required to hold the given character
108    * information.
109    */
createSafeOctets(String safeChars)110   private static boolean[] createSafeOctets(String safeChars) {
111     int maxChar = -1;
112     char[] safeCharArray = safeChars.toCharArray();
113     for (char c : safeCharArray) {
114       maxChar = Math.max(c, maxChar);
115     }
116     boolean[] octets = new boolean[maxChar + 1];
117     for (char c : safeCharArray) {
118       octets[c] = true;
119     }
120     return octets;
121   }
122 
123   /*
124    * Overridden for performance. For unescaped strings this improved the performance of the uri
125    * escaper from ~760ns to ~400ns as measured by {@link CharEscapersBenchmark}.
126    */
127   @Override
nextEscapeIndex(CharSequence csq, int index, int end)128   protected int nextEscapeIndex(CharSequence csq, int index, int end) {
129     checkNotNull(csq);
130     for (; index < end; index++) {
131       char c = csq.charAt(index);
132       if (c >= safeOctets.length || !safeOctets[c]) {
133         break;
134       }
135     }
136     return index;
137   }
138 
139   /*
140    * Overridden for performance. For unescaped strings this improved the performance of the uri
141    * escaper from ~400ns to ~170ns as measured by {@link CharEscapersBenchmark}.
142    */
143   @Override
escape(String s)144   public String escape(String s) {
145     checkNotNull(s);
146     int slen = s.length();
147     for (int index = 0; index < slen; index++) {
148       char c = s.charAt(index);
149       if (c >= safeOctets.length || !safeOctets[c]) {
150         return escapeSlow(s, index);
151       }
152     }
153     return s;
154   }
155 
156   /** Escapes the given Unicode code point in UTF-8. */
157   @Override
158   @CheckForNull
escape(int cp)159   protected char[] escape(int cp) {
160     // We should never get negative values here but if we do it will throw an
161     // IndexOutOfBoundsException, so at least it will get spotted.
162     if (cp < safeOctets.length && safeOctets[cp]) {
163       return null;
164     } else if (cp == ' ' && plusForSpace) {
165       return PLUS_SIGN;
166     } else if (cp <= 0x7F) {
167       // Single byte UTF-8 characters
168       // Start with "%--" and fill in the blanks
169       char[] dest = new char[3];
170       dest[0] = '%';
171       dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
172       dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
173       return dest;
174     } else if (cp <= 0x7ff) {
175       // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
176       // Start with "%--%--" and fill in the blanks
177       char[] dest = new char[6];
178       dest[0] = '%';
179       dest[3] = '%';
180       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
181       cp >>>= 4;
182       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
183       cp >>>= 2;
184       dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
185       cp >>>= 4;
186       dest[1] = UPPER_HEX_DIGITS[0xC | cp];
187       return dest;
188     } else if (cp <= 0xffff) {
189       // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
190       // Start with "%E-%--%--" and fill in the blanks
191       char[] dest = new char[9];
192       dest[0] = '%';
193       dest[1] = 'E';
194       dest[3] = '%';
195       dest[6] = '%';
196       dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
197       cp >>>= 4;
198       dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
199       cp >>>= 2;
200       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
201       cp >>>= 4;
202       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
203       cp >>>= 2;
204       dest[2] = UPPER_HEX_DIGITS[cp];
205       return dest;
206     } else if (cp <= 0x10ffff) {
207       char[] dest = new char[12];
208       // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
209       // Start with "%F-%--%--%--" and fill in the blanks
210       dest[0] = '%';
211       dest[1] = 'F';
212       dest[3] = '%';
213       dest[6] = '%';
214       dest[9] = '%';
215       dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
216       cp >>>= 4;
217       dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
218       cp >>>= 2;
219       dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
220       cp >>>= 4;
221       dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
222       cp >>>= 2;
223       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
224       cp >>>= 4;
225       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
226       cp >>>= 2;
227       dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
228       return dest;
229     } else {
230       // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
231       throw new IllegalArgumentException("Invalid unicode character value " + cp);
232     }
233   }
234 }
235