/**
 * Copyright (c) 2006, Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 package com.google.android.mail.common.base;
17 
18 import static com.google.android.mail.common.base.Preconditions.checkNotNull;
19 
20 import java.io.IOException;
21 
/**
 * Utility functions for dealing with {@code CharEscaper}s, and some commonly
 * used {@code CharEscaper} instances.
 *
 * @author sven@google.com (Sven Mawson)
 * @author laurence@google.com (Laurence Gonsalves)
 */
29 public final class CharEscapers {
CharEscapers()30   private CharEscapers() {}
31 
  // TODO(matevossian): To implementors of escapers --
  //                    For each xxxEscaper method, please add links to external
  //                    reference pages that we consider authoritative for
  //                    exactly what that escaper should be doing.
36 
37   /**
38    * Performs no escaping.
39    */
40   private static final CharEscaper NULL_ESCAPER = new CharEscaper() {
41       @Override
42     public String escape(String string) {
43         checkNotNull(string);
44         return string;
45       }
46 
47       @Override
48       public Appendable escape(final Appendable out) {
49         checkNotNull(out);
50 
51         // we can't simply return out because the CharEscaper contract says that
52         // the returned Appendable will throw a NullPointerException if asked to
53         // append null.
54         return new Appendable() {
55             @Override public Appendable append(CharSequence csq) throws IOException {
56               checkNotNull(csq);
57               out.append(csq);
58               return this;
59             }
60 
61             @Override public Appendable append(CharSequence csq, int start, int end)
62                 throws IOException {
63               checkNotNull(csq);
64               out.append(csq, start, end);
65               return this;
66             }
67 
68             @Override public Appendable append(char c) throws IOException {
69               out.append(c);
70               return this;
71             }
72           };
73       }
74 
75       @Override
76       protected char[] escape(char c) {
77         return null;
78       }
79     };
80 
81   /**
82    * Returns a {@link CharEscaper} that does no escaping.
83    */
nullEscaper()84   public static CharEscaper nullEscaper() {
85     return NULL_ESCAPER;
86   }
87 
88   /**
89    * Returns a {@link CharEscaper} instance that escapes special characters in a
90    * string so it can safely be included in an XML document in either element
91    * content or attribute values.
92    *
93    * <p><b>Note</b></p>: silently removes null-characters and control
94    * characters, as there is no way to represent them in XML.
95    */
xmlEscaper()96   public static CharEscaper xmlEscaper() {
97     return XML_ESCAPER;
98   }
99 
100   /**
101    * Escapes special characters from a string so it can safely be included in an
102    * XML document in either element content or attribute values.  Also removes
103    * null-characters and control characters, as there is no way to represent
104    * them in XML.
105    */
106   private static final CharEscaper XML_ESCAPER = newBasicXmlEscapeBuilder()
107       .addEscape('"', "&quot;")
108       .addEscape('\'', "&apos;")
109       .toEscaper();
110 
111   /**
112    * Returns a {@link CharEscaper} instance that escapes special characters in a
113    * string so it can safely be included in an XML document in element content.
114    *
115    * <p><b>Note</b></p>: double and single quotes are not escaped, so it is not
116    * safe to use this escaper to escape attribute values. Use the
117    * {@link #xmlEscaper()} escaper to escape attribute values or if you are
118    * unsure. Also silently removes non-whitespace control characters, as there
119    * is no way to represent them in XML.
120    */
xmlContentEscaper()121   public static CharEscaper xmlContentEscaper() {
122     return XML_CONTENT_ESCAPER;
123   }
124 
125   /**
126    * Escapes special characters from a string so it can safely be included in an
127    * XML document in element content.  Note that quotes are <em>not</em>
128    * escaped, so <em>this is not safe for use in attribute values</em>. Use
129    * {@link #XML_ESCAPER} for attribute values, or if you are unsure.  Also
130    * removes non-whitespace control characters, as there is no way to represent
131    * them in XML.
132    */
133   private static final CharEscaper XML_CONTENT_ESCAPER =
134       newBasicXmlEscapeBuilder().toEscaper();
135 
136   /**
137    * Returns a {@link CharEscaper} instance that escapes special characters in a
138    * string so it can safely be included in an HTML document in either element
139    * content or attribute values.
140    *
141    * <p><b>Note</b></p>: alters non-ASCII and control characters.
142    *
143    * The entity list was taken from:
144    * <a href="http://www.w3.org/TR/html4/sgml/entities.html">here</a>
145    */
htmlEscaper()146   public static CharEscaper htmlEscaper() {
147     return HtmlEscaperHolder.HTML_ESCAPER;
148   }
149 
150   /**
151    * A lazy initialization holder for HTML_ESCAPER.
152    */
153   private static class HtmlEscaperHolder {
154     private static final CharEscaper HTML_ESCAPER
155         = new HtmlCharEscaper(new CharEscaperBuilder()
156             .addEscape('"',      "&quot;")
157             .addEscape('\'',     "&#39;")
158             .addEscape('&',      "&amp;")
159             .addEscape('<',      "&lt;")
160             .addEscape('>',      "&gt;")
161             .addEscape('\u00A0', "&nbsp;")
162             .addEscape('\u00A1', "&iexcl;")
163             .addEscape('\u00A2', "&cent;")
164             .addEscape('\u00A3', "&pound;")
165             .addEscape('\u00A4', "&curren;")
166             .addEscape('\u00A5', "&yen;")
167             .addEscape('\u00A6', "&brvbar;")
168             .addEscape('\u00A7', "&sect;")
169             .addEscape('\u00A8', "&uml;")
170             .addEscape('\u00A9', "&copy;")
171             .addEscape('\u00AA', "&ordf;")
172             .addEscape('\u00AB', "&laquo;")
173             .addEscape('\u00AC', "&not;")
174             .addEscape('\u00AD', "&shy;")
175             .addEscape('\u00AE', "&reg;")
176             .addEscape('\u00AF', "&macr;")
177             .addEscape('\u00B0', "&deg;")
178             .addEscape('\u00B1', "&plusmn;")
179             .addEscape('\u00B2', "&sup2;")
180             .addEscape('\u00B3', "&sup3;")
181             .addEscape('\u00B4', "&acute;")
182             .addEscape('\u00B5', "&micro;")
183             .addEscape('\u00B6', "&para;")
184             .addEscape('\u00B7', "&middot;")
185             .addEscape('\u00B8', "&cedil;")
186             .addEscape('\u00B9', "&sup1;")
187             .addEscape('\u00BA', "&ordm;")
188             .addEscape('\u00BB', "&raquo;")
189             .addEscape('\u00BC', "&frac14;")
190             .addEscape('\u00BD', "&frac12;")
191             .addEscape('\u00BE', "&frac34;")
192             .addEscape('\u00BF', "&iquest;")
193             .addEscape('\u00C0', "&Agrave;")
194             .addEscape('\u00C1', "&Aacute;")
195             .addEscape('\u00C2', "&Acirc;")
196             .addEscape('\u00C3', "&Atilde;")
197             .addEscape('\u00C4', "&Auml;")
198             .addEscape('\u00C5', "&Aring;")
199             .addEscape('\u00C6', "&AElig;")
200             .addEscape('\u00C7', "&Ccedil;")
201             .addEscape('\u00C8', "&Egrave;")
202             .addEscape('\u00C9', "&Eacute;")
203             .addEscape('\u00CA', "&Ecirc;")
204             .addEscape('\u00CB', "&Euml;")
205             .addEscape('\u00CC', "&Igrave;")
206             .addEscape('\u00CD', "&Iacute;")
207             .addEscape('\u00CE', "&Icirc;")
208             .addEscape('\u00CF', "&Iuml;")
209             .addEscape('\u00D0', "&ETH;")
210             .addEscape('\u00D1', "&Ntilde;")
211             .addEscape('\u00D2', "&Ograve;")
212             .addEscape('\u00D3', "&Oacute;")
213             .addEscape('\u00D4', "&Ocirc;")
214             .addEscape('\u00D5', "&Otilde;")
215             .addEscape('\u00D6', "&Ouml;")
216             .addEscape('\u00D7', "&times;")
217             .addEscape('\u00D8', "&Oslash;")
218             .addEscape('\u00D9', "&Ugrave;")
219             .addEscape('\u00DA', "&Uacute;")
220             .addEscape('\u00DB', "&Ucirc;")
221             .addEscape('\u00DC', "&Uuml;")
222             .addEscape('\u00DD', "&Yacute;")
223             .addEscape('\u00DE', "&THORN;")
224             .addEscape('\u00DF', "&szlig;")
225             .addEscape('\u00E0', "&agrave;")
226             .addEscape('\u00E1', "&aacute;")
227             .addEscape('\u00E2', "&acirc;")
228             .addEscape('\u00E3', "&atilde;")
229             .addEscape('\u00E4', "&auml;")
230             .addEscape('\u00E5', "&aring;")
231             .addEscape('\u00E6', "&aelig;")
232             .addEscape('\u00E7', "&ccedil;")
233             .addEscape('\u00E8', "&egrave;")
234             .addEscape('\u00E9', "&eacute;")
235             .addEscape('\u00EA', "&ecirc;")
236             .addEscape('\u00EB', "&euml;")
237             .addEscape('\u00EC', "&igrave;")
238             .addEscape('\u00ED', "&iacute;")
239             .addEscape('\u00EE', "&icirc;")
240             .addEscape('\u00EF', "&iuml;")
241             .addEscape('\u00F0', "&eth;")
242             .addEscape('\u00F1', "&ntilde;")
243             .addEscape('\u00F2', "&ograve;")
244             .addEscape('\u00F3', "&oacute;")
245             .addEscape('\u00F4', "&ocirc;")
246             .addEscape('\u00F5', "&otilde;")
247             .addEscape('\u00F6', "&ouml;")
248             .addEscape('\u00F7', "&divide;")
249             .addEscape('\u00F8', "&oslash;")
250             .addEscape('\u00F9', "&ugrave;")
251             .addEscape('\u00FA', "&uacute;")
252             .addEscape('\u00FB', "&ucirc;")
253             .addEscape('\u00FC', "&uuml;")
254             .addEscape('\u00FD', "&yacute;")
255             .addEscape('\u00FE', "&thorn;")
256             .addEscape('\u00FF', "&yuml;")
257             .addEscape('\u0152', "&OElig;")
258             .addEscape('\u0153', "&oelig;")
259             .addEscape('\u0160', "&Scaron;")
260             .addEscape('\u0161', "&scaron;")
261             .addEscape('\u0178', "&Yuml;")
262             .addEscape('\u0192', "&fnof;")
263             .addEscape('\u02C6', "&circ;")
264             .addEscape('\u02DC', "&tilde;")
265             .addEscape('\u0391', "&Alpha;")
266             .addEscape('\u0392', "&Beta;")
267             .addEscape('\u0393', "&Gamma;")
268             .addEscape('\u0394', "&Delta;")
269             .addEscape('\u0395', "&Epsilon;")
270             .addEscape('\u0396', "&Zeta;")
271             .addEscape('\u0397', "&Eta;")
272             .addEscape('\u0398', "&Theta;")
273             .addEscape('\u0399', "&Iota;")
274             .addEscape('\u039A', "&Kappa;")
275             .addEscape('\u039B', "&Lambda;")
276             .addEscape('\u039C', "&Mu;")
277             .addEscape('\u039D', "&Nu;")
278             .addEscape('\u039E', "&Xi;")
279             .addEscape('\u039F', "&Omicron;")
280             .addEscape('\u03A0', "&Pi;")
281             .addEscape('\u03A1', "&Rho;")
282             .addEscape('\u03A3', "&Sigma;")
283             .addEscape('\u03A4', "&Tau;")
284             .addEscape('\u03A5', "&Upsilon;")
285             .addEscape('\u03A6', "&Phi;")
286             .addEscape('\u03A7', "&Chi;")
287             .addEscape('\u03A8', "&Psi;")
288             .addEscape('\u03A9', "&Omega;")
289             .addEscape('\u03B1', "&alpha;")
290             .addEscape('\u03B2', "&beta;")
291             .addEscape('\u03B3', "&gamma;")
292             .addEscape('\u03B4', "&delta;")
293             .addEscape('\u03B5', "&epsilon;")
294             .addEscape('\u03B6', "&zeta;")
295             .addEscape('\u03B7', "&eta;")
296             .addEscape('\u03B8', "&theta;")
297             .addEscape('\u03B9', "&iota;")
298             .addEscape('\u03BA', "&kappa;")
299             .addEscape('\u03BB', "&lambda;")
300             .addEscape('\u03BC', "&mu;")
301             .addEscape('\u03BD', "&nu;")
302             .addEscape('\u03BE', "&xi;")
303             .addEscape('\u03BF', "&omicron;")
304             .addEscape('\u03C0', "&pi;")
305             .addEscape('\u03C1', "&rho;")
306             .addEscape('\u03C2', "&sigmaf;")
307             .addEscape('\u03C3', "&sigma;")
308             .addEscape('\u03C4', "&tau;")
309             .addEscape('\u03C5', "&upsilon;")
310             .addEscape('\u03C6', "&phi;")
311             .addEscape('\u03C7', "&chi;")
312             .addEscape('\u03C8', "&psi;")
313             .addEscape('\u03C9', "&omega;")
314             .addEscape('\u03D1', "&thetasym;")
315             .addEscape('\u03D2', "&upsih;")
316             .addEscape('\u03D6', "&piv;")
317             .addEscape('\u2002', "&ensp;")
318             .addEscape('\u2003', "&emsp;")
319             .addEscape('\u2009', "&thinsp;")
320             .addEscape('\u200C', "&zwnj;")
321             .addEscape('\u200D', "&zwj;")
322             .addEscape('\u200E', "&lrm;")
323             .addEscape('\u200F', "&rlm;")
324             .addEscape('\u2013', "&ndash;")
325             .addEscape('\u2014', "&mdash;")
326             .addEscape('\u2018', "&lsquo;")
327             .addEscape('\u2019', "&rsquo;")
328             .addEscape('\u201A', "&sbquo;")
329             .addEscape('\u201C', "&ldquo;")
330             .addEscape('\u201D', "&rdquo;")
331             .addEscape('\u201E', "&bdquo;")
332             .addEscape('\u2020', "&dagger;")
333             .addEscape('\u2021', "&Dagger;")
334             .addEscape('\u2022', "&bull;")
335             .addEscape('\u2026', "&hellip;")
336             .addEscape('\u2030', "&permil;")
337             .addEscape('\u2032', "&prime;")
338             .addEscape('\u2033', "&Prime;")
339             .addEscape('\u2039', "&lsaquo;")
340             .addEscape('\u203A', "&rsaquo;")
341             .addEscape('\u203E', "&oline;")
342             .addEscape('\u2044', "&frasl;")
343             .addEscape('\u20AC', "&euro;")
344             .addEscape('\u2111', "&image;")
345             .addEscape('\u2118', "&weierp;")
346             .addEscape('\u211C', "&real;")
347             .addEscape('\u2122', "&trade;")
348             .addEscape('\u2135', "&alefsym;")
349             .addEscape('\u2190', "&larr;")
350             .addEscape('\u2191', "&uarr;")
351             .addEscape('\u2192', "&rarr;")
352             .addEscape('\u2193', "&darr;")
353             .addEscape('\u2194', "&harr;")
354             .addEscape('\u21B5', "&crarr;")
355             .addEscape('\u21D0', "&lArr;")
356             .addEscape('\u21D1', "&uArr;")
357             .addEscape('\u21D2', "&rArr;")
358             .addEscape('\u21D3', "&dArr;")
359             .addEscape('\u21D4', "&hArr;")
360             .addEscape('\u2200', "&forall;")
361             .addEscape('\u2202', "&part;")
362             .addEscape('\u2203', "&exist;")
363             .addEscape('\u2205', "&empty;")
364             .addEscape('\u2207', "&nabla;")
365             .addEscape('\u2208', "&isin;")
366             .addEscape('\u2209', "&notin;")
367             .addEscape('\u220B', "&ni;")
368             .addEscape('\u220F', "&prod;")
369             .addEscape('\u2211', "&sum;")
370             .addEscape('\u2212', "&minus;")
371             .addEscape('\u2217', "&lowast;")
372             .addEscape('\u221A', "&radic;")
373             .addEscape('\u221D', "&prop;")
374             .addEscape('\u221E', "&infin;")
375             .addEscape('\u2220', "&ang;")
376             .addEscape('\u2227', "&and;")
377             .addEscape('\u2228', "&or;")
378             .addEscape('\u2229', "&cap;")
379             .addEscape('\u222A', "&cup;")
380             .addEscape('\u222B', "&int;")
381             .addEscape('\u2234', "&there4;")
382             .addEscape('\u223C', "&sim;")
383             .addEscape('\u2245', "&cong;")
384             .addEscape('\u2248', "&asymp;")
385             .addEscape('\u2260', "&ne;")
386             .addEscape('\u2261', "&equiv;")
387             .addEscape('\u2264', "&le;")
388             .addEscape('\u2265', "&ge;")
389             .addEscape('\u2282', "&sub;")
390             .addEscape('\u2283', "&sup;")
391             .addEscape('\u2284', "&nsub;")
392             .addEscape('\u2286', "&sube;")
393             .addEscape('\u2287', "&supe;")
394             .addEscape('\u2295', "&oplus;")
395             .addEscape('\u2297', "&otimes;")
396             .addEscape('\u22A5', "&perp;")
397             .addEscape('\u22C5', "&sdot;")
398             .addEscape('\u2308', "&lceil;")
399             .addEscape('\u2309', "&rceil;")
400             .addEscape('\u230A', "&lfloor;")
401             .addEscape('\u230B', "&rfloor;")
402             .addEscape('\u2329', "&lang;")
403             .addEscape('\u232A', "&rang;")
404             .addEscape('\u25CA', "&loz;")
405             .addEscape('\u2660', "&spades;")
406             .addEscape('\u2663', "&clubs;")
407             .addEscape('\u2665', "&hearts;")
408             .addEscape('\u2666', "&diams;")
409             .toArray());
410   }
411 
412   /**
413    * Returns a {@link CharEscaper} instance that escapes special characters in a
414    * string so it can safely be included in an HTML document in either element
415    * content or attribute values.
416    *
417    * <p><b>Note</b></p>: does not alter non-ASCII and control characters.
418    */
asciiHtmlEscaper()419   public static CharEscaper asciiHtmlEscaper() {
420     return ASCII_HTML_ESCAPER;
421   }
422 
423   /**
424    * Escapes special characters from a string so it can safely be included in an
425    * HTML document in either element content or attribute values. Does
426    * <em>not</em> alter non-ASCII characters or control characters.
427    */
428   private static final CharEscaper ASCII_HTML_ESCAPER = new CharEscaperBuilder()
429       .addEscape('"', "&quot;")
430       .addEscape('\'', "&#39;")
431       .addEscape('&', "&amp;")
432       .addEscape('<', "&lt;")
433       .addEscape('>', "&gt;")
434       .toEscaper();
435 
436   /**
437    * Returns an {@link Escaper} instance that escapes Java chars so they can be
438    * safely included in URIs. For details on escaping URIs, see section 2.4 of
439    * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>.
440    *
441    * <p>When encoding a String, the following rules apply:
442    * <ul>
443    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
444    *     through "9" remain the same.
445    * <li>The special characters ".", "-", "*", and "_" remain the same.
446    * <li>The space character " " is converted into a plus sign "+".
447    * <li>All other characters are converted into one or more bytes using UTF-8
448    *     encoding and each byte is then represented by the 3-character string
449    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
450    *     representation of the byte value.
451    * <ul>
452    *
453    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
454    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
455    * RFC 3986</a>:<br>
456    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
457    * for all percent-encodings."</i>
458    *
459    * <p>This escaper has identical behavior to (but is potentially much faster
460    * than):
461    * <ul>
462    * <li>{@link com.google.httputil.FastURLEncoder#encode(String)}
463    * <li>{@link com.google.httputil.FastURLEncoder#encode(String,String)}
464    *     with the encoding name "UTF-8"
465    * <li>{@link java.net.URLEncoder#encode(String, String)}
466    *     with the encoding name "UTF-8"
467    * </ul>
468    *
469    * <p>This method is equivalent to {@code uriEscaper(true)}.
470    */
uriEscaper()471   public static Escaper uriEscaper() {
472     return uriEscaper(true);
473   }
474 
475   /**
476    * Returns an {@link Escaper} instance that escapes Java chars so they can be
477    * safely included in URI path segments. For details on escaping URIs, see
478    * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
479    *
480    * <p>When encoding a String, the following rules apply:
481    * <ul>
482    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
483    *     through "9" remain the same.
484    * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
485    * <li>The general delimiters "@" and ":" remain the same.
486    * <li>The subdelimiters "!", "$", "&amp;", "'", "(", ")", "*", ",", ";",
487    *     and "=" remain the same.
488    * <li>The space character " " is converted into %20.
489    * <li>All other characters are converted into one or more bytes using UTF-8
490    *     encoding and each byte is then represented by the 3-character string
491    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
492    *     representation of the byte value.
493    * </ul>
494    *
495    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
496    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
497    * RFC 3986</a>:<br>
498    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
499    * for all percent-encodings."</i>
500    */
uriPathEscaper()501   public static Escaper uriPathEscaper() {
502     return URI_PATH_ESCAPER;
503   }
504 
505   /**
506    * Returns an {@link Escaper} instance that escapes Java chars so they can be
507    * safely included in URI query string segments. When the query string
508    * consists of a sequence of name=value pairs separated by &amp;, the names
509    * and values should be individually encoded. If you escape an entire query
510    * string in one pass with this escaper, then the "=" and "&amp;" characters
511    * used as separators will also be escaped.
512    *
513    * <p>This escaper is also suitable for escaping fragment identifiers.
514    *
515    * <p>For details on escaping URIs, see
516    * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
517    *
518    * <p>When encoding a String, the following rules apply:
519    * <ul>
520    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
521    *     through "9" remain the same.
522    * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
523    * <li>The general delimiters "@" and ":" remain the same.
524    * <li>The path delimiters "/" and "?" remain the same.
525    * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";",
526    *     remain the same.
527    * <li>The space character " " is converted into %20.
528    * <li>The equals sign "=" is converted into %3D.
529    * <li>The ampersand "&amp;" is converted into %26.
530    * <li>All other characters are converted into one or more bytes using UTF-8
531    *     encoding and each byte is then represented by the 3-character string
532    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
533    *     representation of the byte value.
534    * </ul>
535    *
536    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
537    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
538    * RFC 3986</a>:<br>
539    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
540    * for all percent-encodings."</i>
541    *
542    * <p>This method is equivalent to {@code uriQueryStringEscaper(false)}.
543    */
uriQueryStringEscaper()544   public static Escaper uriQueryStringEscaper() {
545     return uriQueryStringEscaper(false);
546   }
547 
548   /**
549    * Returns a {@link Escaper} instance that escapes Java characters so they can
550    * be safely included in URIs. For details on escaping URIs, see section 2.4
551    * of <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>.
552    *
553    * <p>When encoding a String, the following rules apply:
554    * <ul>
555    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
556    *     through "9" remain the same.
557    * <li>The special characters ".", "-", "*", and "_" remain the same.
558    * <li>If {@code plusForSpace} was specified, the space character " " is
559    *     converted into a plus sign "+". Otherwise it is converted into "%20".
560    * <li>All other characters are converted into one or more bytes using UTF-8
561    *     encoding and each byte is then represented by the 3-character string
562    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
563    *     representation of the byte value.
564    * </ul>
565    *
566    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
567    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
568    * RFC 3986</a>:<br>
569    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
570    * for all percent-encodings."</i>
571    *
572    * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise
573    *        it is escaped to {@code %20}. Although common, the escaping of
574    *        spaces as plus signs has a very ambiguous status in the relevant
575    *        specifications. You should prefer {@code %20} unless you are doing
576    *        exact character-by-character comparisons of URLs and backwards
577    *        compatibility requires you to use plus signs.
578    *
579    * @see #uriEscaper()
580    */
uriEscaper(boolean plusForSpace)581   public static Escaper uriEscaper(boolean plusForSpace) {
582     return plusForSpace ? URI_ESCAPER : URI_ESCAPER_NO_PLUS;
583   }
584 
585   /**
586    * Returns an {@link Escaper} instance that escapes Java chars so they can be
587    * safely included in URI query string segments. When the query string
588    * consists of a sequence of name=value pairs separated by &amp;, the names
589    * and values should be individually encoded. If you escape an entire query
590    * string in one pass with this escaper, then the "=" and "&amp;" characters
591    * used as separators will also be escaped.
592    *
593    * <p>This escaper is also suitable for escaping fragment identifiers.
594    *
595    * <p>For details on escaping URIs, see
596    * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
597    *
598    * <p>When encoding a String, the following rules apply:
599    * <ul>
600    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
601    *     through "9" remain the same.
602    * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
603    * <li>The general delimiters "@" and ":" remain the same.
604    * <li>The path delimiters "/" and "?" remain the same.
605    * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";",
606    *     remain the same.
607    * <li>If {@code plusForSpace} was specified, the space character " " is
608    *     converted into a plus sign "+". Otherwise it is converted into "%20".
609    * <li>The equals sign "=" is converted into %3D.
610    * <li>The ampersand "&amp;" is converted into %26.
611    * <li>All other characters are converted into one or more bytes using UTF-8
612    *     encoding and each byte is then represented by the 3-character string
613    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
614    *     representation of the byte value.
615    * </ul>
616    *
617    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
618    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
619    * RFC 3986</a>:<br>
620    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
621    * for all percent-encodings."</i>
622    *
623    * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise
624    *        it is escaped to {@code %20}. Although common, the escaping of
625    *        spaces as plus signs has a very ambiguous status in the relevant
626    *        specifications. You should prefer {@code %20} unless you are doing
627    *        exact character-by-character comparisons of URLs and backwards
628    *        compatibility requires you to use plus signs.
629    *
630    * @see #uriQueryStringEscaper()
631    */
uriQueryStringEscaper(boolean plusForSpace)632   public static Escaper uriQueryStringEscaper(boolean plusForSpace) {
633     return plusForSpace ?
634            URI_QUERY_STRING_ESCAPER_WITH_PLUS : URI_QUERY_STRING_ESCAPER;
635   }
636 
637   private static final Escaper URI_ESCAPER =
638       new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, true);
639 
640   private static final Escaper URI_ESCAPER_NO_PLUS =
641       new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, false);
642 
643   private static final Escaper URI_PATH_ESCAPER =
644       new PercentEscaper(PercentEscaper.SAFEPATHCHARS_URLENCODER, false);
645 
646   private static final Escaper URI_QUERY_STRING_ESCAPER =
647       new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, false);
648 
649   private static final Escaper URI_QUERY_STRING_ESCAPER_WITH_PLUS =
650       new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, true);
651 
652   /**
653    * Returns a {@link Escaper} instance that escapes Java characters in a manner
654    * compatible with the C++ webutil/url URL class (the {@code kGoogle1Escape}
655    * set).
656    *
657    * <p>When encoding a String, the following rules apply:
658    * <ul>
659    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
660    * through "9" remain the same.
661    * <li>The special characters "!", "(", ")", "*", "-", ".", "_", "~", ",", "/"
662    * and ":" remain the same.
663    * <li>The space character " " is converted into a plus sign "+".
664    * <li>All other characters are converted into one or more bytes using UTF-8
665    *     encoding and each byte is then represented by the 3-character string
666    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
667    *     representation of the byte value.
668    * </ul>
669    *
670    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
671    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
672    * RFC 3986</a>:<br>
673    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
674    * for all percent-encodings."</i>
675    *
676    * <p><b>Note</b>: This escaper is a special case and is <em>not
677    * compliant</em> with <a href="http://www.ietf.org/rfc/rfc2396.txt">
678    * RFC 2396</a>. Specifically it will not escape "/", ":" and ",". This is
679    * only provided for certain limited use cases and you should favor using
680    * {@link #uriEscaper()} whenever possible.
681    */
cppUriEscaper()682   public static Escaper cppUriEscaper() {
683     return CPP_URI_ESCAPER;
684   }
685 
686   // Based on comments from FastURLEncoder:
687   // These octets mimic the ones escaped by the C++ webutil/url URL class --
688   // the kGoogle1Escape set.
689   // To produce the same escaping as C++, use this set with the plusForSpace
690   // option.
691   // WARNING: Contrary to RFC 2396 ",", "/" and ":" are listed as safe here.
692   private static final Escaper CPP_URI_ESCAPER =
693       new PercentEscaper("!()*-._~,/:", true);
694 
695   /**
696    * Returns a {@link CharEscaper} instance that escapes special characters in a
697    * string so it can safely be included in a Java string literal.
698    *
699    * <p><b>Note</b></p>: does not escape single quotes, so use the escaper
700    * returned by {@link #javaCharEscaper()} if you are generating char
701    * literals or if you are unsure.
702    */
javaStringEscaper()703   public static CharEscaper javaStringEscaper() {
704     return JAVA_STRING_ESCAPER;
705   }
706 
707   /**
708    * Escapes special characters from a string so it can safely be included in a
709    * Java string literal. Does <em>not</em> escape single-quotes, so use
710    * JAVA_CHAR_ESCAPE if you are generating char literals, or if you are unsure.
711    *
712    * <p>Note that non-ASCII characters will be octal or Unicode escaped.
713    */
714   private static final CharEscaper JAVA_STRING_ESCAPER
715       = new JavaCharEscaper(new CharEscaperBuilder()
716           .addEscape('\b', "\\b")
717           .addEscape('\f', "\\f")
718           .addEscape('\n', "\\n")
719           .addEscape('\r', "\\r")
720           .addEscape('\t', "\\t")
721           .addEscape('\"', "\\\"")
722           .addEscape('\\', "\\\\")
723           .toArray());
724 
725   /**
726    * Returns a {@link CharEscaper} instance that escapes special characters in a
727    * string so it can safely be included in a Java char or string literal. The
728    * behavior of this escaper is the same as that of the
729    * {@link #javaStringEscaper()}, except it also escapes single quotes.
730    */
javaCharEscaper()731   public static CharEscaper javaCharEscaper() {
732     return JAVA_CHAR_ESCAPER;
733   }
734 
735   /**
736    * Escapes special characters from a string so it can safely be included in a
737    * Java char literal or string literal.
738    *
739    * <p>Note that non-ASCII characters will be octal or Unicode escaped.
740    *
741    * <p>This is the same as {@link #JAVA_STRING_ESCAPER}, except that it escapes
742    * single quotes.
743    */
744   private static final CharEscaper JAVA_CHAR_ESCAPER
745       = new JavaCharEscaper(new CharEscaperBuilder()
746           .addEscape('\b', "\\b")
747           .addEscape('\f', "\\f")
748           .addEscape('\n', "\\n")
749           .addEscape('\r', "\\r")
750           .addEscape('\t', "\\t")
751           .addEscape('\'', "\\'")
752           .addEscape('\"', "\\\"")
753           .addEscape('\\', "\\\\")
754           .toArray());
755 
756   /**
757    * Returns a {@link CharEscaper} instance that replaces non-ASCII characters
758    * in a string with their Unicode escape sequences ({@code \\uxxxx} where
759    * {@code xxxx} is a hex number). Existing escape sequences won't be affected.
760    */
javaStringUnicodeEscaper()761   public static CharEscaper javaStringUnicodeEscaper() {
762     return JAVA_STRING_UNICODE_ESCAPER;
763   }
764 
765   /**
766    * Escapes each non-ASCII character in with its Unicode escape sequence
767    * {@code \\uxxxx} where {@code xxxx} is a hex number. Existing escape
768    * sequences won't be affected.
769    */
770   private static final CharEscaper JAVA_STRING_UNICODE_ESCAPER
771       = new CharEscaper() {
772           @Override protected char[] escape(char c) {
773             if (c <= 127) {
774               return null;
775             }
776 
777             char[] r = new char[6];
778             r[5] = HEX_DIGITS[c & 15];
779             c >>>= 4;
780             r[4] = HEX_DIGITS[c & 15];
781             c >>>= 4;
782             r[3] = HEX_DIGITS[c & 15];
783             c >>>= 4;
784             r[2] = HEX_DIGITS[c & 15];
785             r[1] = 'u';
786             r[0] = '\\';
787             return r;
788           }
789         };
790 
791   /**
792    * Returns a {@link CharEscaper} instance that escapes special characters from
793    * a string so it can safely be included in a Python string literal. Does not
794    * have any special handling for non-ASCII characters.
795    */
pythonEscaper()796   public static CharEscaper pythonEscaper() {
797     return PYTHON_ESCAPER;
798   }
799 
800   /**
801    * Escapes special characters in a string so it can safely be included in a
802    * Python string literal. Does not have any special handling for non-ASCII
803    * characters.
804    */
805   private static final CharEscaper PYTHON_ESCAPER = new CharEscaperBuilder()
806       // TODO(laurence): perhaps this should escape non-ASCII characters?
807       .addEscape('\n', "\\n")
808       .addEscape('\r', "\\r")
809       .addEscape('\t', "\\t")
810       .addEscape('\\', "\\\\")
811       .addEscape('\"', "\\\"")
812       .addEscape('\'', "\\\'")
813       .toEscaper();
814 
815   /**
816    * Returns a {@link CharEscaper} instance that escapes non-ASCII characters in
817    * a string so it can safely be included in a Javascript string literal.
818    * Non-ASCII characters are replaced with their ASCII javascript escape
819    * sequences (e.g., \\uhhhh or \xhh).
820    */
javascriptEscaper()821   public static CharEscaper javascriptEscaper() {
822     return JAVASCRIPT_ESCAPER;
823   }
824 
825   /**
826    * {@code CharEscaper} to escape javascript strings. Turns all non-ASCII
827    * characters into ASCII javascript escape sequences (e.g., \\uhhhh or \xhh).
828    */
829   private static final CharEscaper JAVASCRIPT_ESCAPER
830       = new JavascriptCharEscaper(new CharEscaperBuilder()
831           .addEscape('\'', "\\x27")
832           .addEscape('"',  "\\x22")
833           .addEscape('<',  "\\x3c")
834           .addEscape('=',  "\\x3d")
835           .addEscape('>',  "\\x3e")
836           .addEscape('&',  "\\x26")
837           .addEscape('\b', "\\b")
838           .addEscape('\t', "\\t")
839           .addEscape('\n', "\\n")
840           .addEscape('\f', "\\f")
841           .addEscape('\r', "\\r")
842           .addEscape('\\', "\\\\")
843           .toArray());
844 
newBasicXmlEscapeBuilder()845   private static CharEscaperBuilder newBasicXmlEscapeBuilder() {
846     return new CharEscaperBuilder()
847         .addEscape('&', "&amp;")
848         .addEscape('<', "&lt;")
849         .addEscape('>', "&gt;")
850         .addEscapes(new char[] {
851             '\000', '\001', '\002', '\003', '\004',
852             '\005', '\006', '\007', '\010', '\013',
853             '\014', '\016', '\017', '\020', '\021',
854             '\022', '\023', '\024', '\025', '\026',
855             '\027', '\030', '\031', '\032', '\033',
856             '\034', '\035', '\036', '\037'}, "");
857   }
858 
859   /**
860    * Returns a composite {@link CharEscaper} instance that tries to escape
861    * characters using a primary {@code CharEscaper} first and falls back to a
862    * secondary one if there is no escaping.
863    *
864    * <p>The returned escaper will attempt to escape each character using the
865    * primary escaper, and if the primary escaper has no escaping for that
866    * character, it will use the secondary escaper. If the secondary escaper has
867    * no escaping for a character either, the original character will be used.
868    * If the primary escaper has an escape for a character, the secondary escaper
869    * will not be used at all for that character; the escaped output of the
870    * primary is not run through the secondary. For a case where you would like
871    * to first escape with one escaper, and then with another, it is recommended
872    * that you call each escaper in order.
873    *
874    * @param primary The primary {@code CharEscaper} to use
875    * @param secondary The secondary {@code CharEscaper} to use if the first one
876    *     has no escaping rule for a character
877    * @throws NullPointerException if any of the arguments is null
878    */
fallThrough(CharEscaper primary, CharEscaper secondary)879   public static CharEscaper fallThrough(CharEscaper primary,
880       CharEscaper secondary) {
881     checkNotNull(primary);
882     checkNotNull(secondary);
883     return new FallThroughCharEscaper(primary, secondary);
884   }
885 
886   /**
887    * A fast {@link CharEscaper} that uses an array of replacement characters and
888    * a range of safe characters. It overrides {@link #escape(String)} to improve
889    * performance. Rough benchmarking shows that this almost doubles the speed
890    * when processing strings that do not require escaping (providing the escape
891    * test itself is efficient).
892    */
893   private static abstract class FastCharEscaper extends CharEscaper {
894 
895     protected final char[][] replacements;
896     protected final int replacementLength;
897     protected final char safeMin;
898     protected final char safeMax;
899 
FastCharEscaper(char[][] replacements, char safeMin, char safeMax)900     public FastCharEscaper(char[][] replacements, char safeMin, char safeMax) {
901       this.replacements = replacements;
902       this.replacementLength = replacements.length;
903       this.safeMin = safeMin;
904       this.safeMax = safeMax;
905     }
906 
907     /** Overridden for performance (see {@link FastCharEscaper}). */
escape(String s)908     @Override public String escape(String s) {
909       int slen = s.length();
910       for (int index = 0; index < slen; index++) {
911         char c = s.charAt(index);
912         if ((c < replacementLength && replacements[c] != null)
913             || c < safeMin || c > safeMax) {
914           return escapeSlow(s, index);
915         }
916       }
917       return s;
918     }
919   }
920 
921   /**
922    * Escaper for Java character escaping, contains both an array and a
923    * backup function.  We're not overriding the array decorator because we
924    * want to keep this as fast as possible, so no calls to super.escape first.
925    */
926   private static class JavaCharEscaper extends FastCharEscaper {
927 
JavaCharEscaper(char[][] replacements)928     public JavaCharEscaper(char[][] replacements) {
929       super(replacements, ' ', '~');
930     }
931 
escape(char c)932     @Override protected char[] escape(char c) {
933       // First check if our array has a valid escaping.
934       if (c < replacementLength) {
935         char[] r = replacements[c];
936         if (r != null) {
937           return r;
938         }
939       }
940 
941       // This range is un-escaped.
942       if (safeMin <= c && c <= safeMax) {
943         return null;
944       }
945 
946       if (c <= 0xFF) {
947         // Convert c to an octal-escaped string.
948         // Equivalent to String.format("\\%03o", (int)c);
949         char[] r = new char[4];
950         r[0] = '\\';
951         r[3] = HEX_DIGITS[c & 7];
952         c >>>= 3;
953         r[2] = HEX_DIGITS[c & 7];
954         c >>>= 3;
955         r[1] = HEX_DIGITS[c & 7];
956         return r;
957       }
958 
959       // Convert c to a hex-escaped string.
960       // Equivalent to String.format("\\u%04x", (int)c);
961       char[] r = new char[6];
962       r[0] = '\\';
963       r[1] = 'u';
964       r[5] = HEX_DIGITS[c & 15];
965       c >>>= 4;
966       r[4] = HEX_DIGITS[c & 15];
967       c >>>= 4;
968       r[3] = HEX_DIGITS[c & 15];
969       c >>>= 4;
970       r[2] = HEX_DIGITS[c & 15];
971       return r;
972     }
973   }
974 
975   /**
976    * Escaper for javascript character escaping, contains both an array and a
977    * backup function. We're not overriding the array decorator because we
978    * want to keep this as fast as possible, so no calls to super.escape first.
979    */
980   private static class JavascriptCharEscaper extends FastCharEscaper {
981 
JavascriptCharEscaper(char[][] replacements)982     public JavascriptCharEscaper(char[][] replacements) {
983       super(replacements, ' ', '~');
984     }
985 
escape(char c)986     @Override protected char[] escape(char c) {
987       // First check if our array has a valid escaping.
988       if (c < replacementLength) {
989         char[] r = replacements[c];
990         if (r != null) {
991           return r;
992         }
993       }
994 
995       // This range is unescaped.
996       if (safeMin <= c && c <= safeMax) {
997         return null;
998       }
999 
1000       // we can do a 2 digit hex escape for chars less that 0x100
1001       if (c < 0x100) {
1002         char[] r = new char[4];
1003         r[3] = HEX_DIGITS[c & 0xf];
1004         c >>>= 4;
1005         r[2] = HEX_DIGITS[c & 0xf];
1006         r[1] = 'x';
1007         r[0] = '\\';
1008         return r;
1009       }
1010 
1011       // 4 digit hex escape everything else
1012       char[] r = new char[6];
1013       r[5] = HEX_DIGITS[c & 0xf];
1014       c >>>= 4;
1015       r[4] = HEX_DIGITS[c & 0xf];
1016       c >>>= 4;
1017       r[3] = HEX_DIGITS[c & 0xf];
1018       c >>>= 4;
1019       r[2] = HEX_DIGITS[c & 0xf];
1020       r[1] = 'u';
1021       r[0] = '\\';
1022       return r;
1023     }
1024   }
1025 
1026   /**
1027    * Escaper for HTML character escaping, contains both an array and a
1028    * backup function.  We're not overriding the array decorator because we
1029    * want to keep this as fast as possible, so no calls to super.escape first.
1030    */
1031   private static class HtmlCharEscaper extends FastCharEscaper {
1032 
HtmlCharEscaper(char[][] replacements)1033     public HtmlCharEscaper(char[][] replacements) {
1034       super(replacements, Character.MIN_VALUE, '~');
1035     }
1036 
escape(char c)1037     @Override protected char[] escape(char c) {
1038       // First check if our array has a valid escaping.
1039       if (c < replacementLength) {
1040         char[] r = replacements[c];
1041         if (r != null) {
1042           return r;
1043         }
1044       }
1045 
1046       // ~ is ASCII 126, the highest value char that does not need
1047       // to be escaped
1048       if (c <= safeMax) {
1049         return null;
1050       }
1051 
1052       int index;
1053       if (c < 1000) {
1054         index = 4;
1055       } else if (c < 10000) {
1056         index = 5;
1057       } else {
1058         index = 6;
1059       }
1060       char[] result = new char[index + 2];
1061       result[0] = '&';
1062       result[1] = '#';
1063       result[index + 1] = ';';
1064 
1065       // TODO(sven): Convert this to a sequence of shifts/additions
1066       // to avoid the division and modulo operators.
1067       int intValue = c;
1068       for (; index > 1; index--) {
1069         result[index] = HEX_DIGITS[intValue % 10];
1070         intValue /= 10;
1071       }
1072       return result;
1073     }
1074   }
1075 
1076   /**
1077    * A composite {@code CharEscaper} object that tries to escape characters
1078    * using a primary {@code CharEscaper} first and falls back to a secondary
1079    * one if there is no escaping.
1080    */
1081   private static class FallThroughCharEscaper extends CharEscaper {
1082 
1083     private final CharEscaper primary;
1084     private final CharEscaper secondary;
1085 
FallThroughCharEscaper(CharEscaper primary, CharEscaper secondary)1086     public FallThroughCharEscaper(CharEscaper primary, CharEscaper secondary) {
1087       this.primary = primary;
1088       this.secondary = secondary;
1089     }
1090 
1091     @Override
escape(char c)1092     protected char[] escape(char c) {
1093       char result[] = primary.escape(c);
1094       if (result == null) {
1095         result = secondary.escape(c);
1096       }
1097       return result;
1098     }
1099   }
1100 
1101   private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
1102 }