• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 Jeff Ichnowski
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions
6 // are met:
7 //
8 //     * Redistributions of source code must retain the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer.
11 //
12 //     * Redistributions in binary form must reproduce the above
13 //       copyright notice, this list of conditions and the following
14 //       disclaimer in the documentation and/or other materials
15 //       provided with the distribution.
16 //
17 //     * Neither the name of the OWASP nor the names of its
18 //       contributors may be used to endorse or promote products
19 //       derived from this software without specific prior written
20 //       permission.
21 //
22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
27 // INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33 // OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 package org.owasp.encoder;
36 
37 import java.io.IOException;
38 import java.io.Writer;
39 import java.nio.CharBuffer;
40 import java.nio.charset.CoderResult;
41 
42 /**
43  * Encode -- fluent interface for contextual encoding.  Example usage in a JSP:
44  *
45  * <pre>
46  *     &lt;input value="&lt;%=Encode.forHtml(value)%&gt;" /&gt;
47  * </pre>
48  *
49  * <p>There are two versions of each contextual encoding method.  The first
50  * takes a {@code String} argument and returns the encoded version as a
51  * {@code String}.  The second version writes the encoded version directly
52  * to a {@code Writer}.</p>
53  *
54  * <p>Please make sure to read and understand the context that the method encodes
55  * for.  Encoding for the incorrect context will likely lead to exposing a
56  * cross-site scripting vulnerability. Those new to XSS mitigation may find it
57  * useful to read the
58  * <a href="https://cheatsheetseries.owasp.org/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.html">
59  * Cross Site Scripting Prevention Cheat Sheet</a> that is part of the OWASP Cheat Sheet series for background
60  * material.
61  * </p>
62  *
63  * @author Jeff Ichnowski
64  */
65 public final class Encode {
66     /** No instances. */
Encode()67     private Encode() {}
68 
69     /**
70      * <p>Encodes for (X)HTML text content and text attributes.  Since
71      * this method encodes for both contexts, it may be slightly less
72      * efficient to use this method over the methods targeted towards
73      * the specific contexts ({@link #forHtmlAttribute(String)} and
74      * {@link #forHtmlContent(String)}).  In general this method should
75      * be preferred unless you are really concerned with saving a few
76      * bytes or are writing a framework that utilizes this
77      * package.</p>
78      *
79      * <b>Example JSP Usage</b>
80      * <pre>
81      *     &lt;div&gt;&lt;%=Encode.forHtml(unsafeData)%&gt;&lt;/div&gt;
82      *
83      *     &lt;input value="&lt;%=Encode.forHtml(unsafeData)%&gt;" /&gt;
84      * </pre>
85      *
86      * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
87      *   <caption><b>Encoding&nbsp;Table</b></caption>
88      *   <thead>
89      *     <tr>
90      *       <th align="left" class="colFirst">Input</th>
91      *       <th align="left" class="colLast">Result</th>
92      *     </tr>
93      *   </thead>
94      *   <tbody>
95      *     <tr class="altColor">
96      *       <td class="colFirst">{@code &}</td>
97      *       <td class="colLast">{@code &amp;}</td>
98      *     </tr>
99      *     <tr class="rowColor">
100      *       <td class="colFirst">{@code <}</td>
101      *       <td class="colLast">{@code &lt;}</td>
102      *     </tr>
103      *     <tr class="altColor">
104      *       <td class="colFirst">{@code >}</td>
105      *       <td class="colLast">{@code &gt;}</td>
106      *     </tr>
107      *     <tr class="rowColor">
108      *       <td class="colFirst">{@code "}</td>
109      *       <td class="colLast">{@code &#34;}</td>
110      *     </tr>
111      *     <tr class="altColor">
112      *       <td class="colFirst">{@code '}</td>
113      *       <td class="colLast">{@code &#39;}</td>
114      *     </tr>
115      *   </tbody>
116      * </table>
117      *
118      * <p><b>Additional Notes</b></p>
119      * <ul>
120      * <li>The encoding of the greater-than sign ({@code >}) is not
121      * strictly required, but is included for maximum
122      * compatibility.</li>
123      *
124      * <li>Numeric encoding is used for double-quote character ({@code
     * "}) as it is shorter than the also valid {@code &quot;}.</li>
126      *
127      * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
128      * (U+09) and space (U+20) are valid in quoted attributes and in
129      * block in an unescaped form.</li>
130      *
131      * <li>Surrogate pairs are passed through only if valid.</li>
132      *
133      * <li>Characters that are not <a
134      * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
135      * to the XML specification</a> are replaced by a space character
136      * as they could lead to parsing errors.  In particular only {@code #x9
137      * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
138      * [#x10000-#x10FFFF]} are considered valid.</li>
139      * </ul>
140      *
141      * @param input the data to encode
142      * @return the data encoded for html.
143      */
forHtml(String input)144     public static String forHtml(String input) {
145         return forXml(input);
146     }
147 
148     /**
149      * See {@link #forHtml(String)} for description of encoding.  This
150      * version writes directly to a Writer without an intervening string.
151      *
152      * @param out where to write encoded output
153      * @param input the input string to encode
154      * @throws IOException if thrown by writer
155      */
forHtml(Writer out, String input)156     public static void forHtml(Writer out, String input) throws IOException {
157         forXml(out, input);
158     }
159 
160     /**
161      * <p>This method encodes for HTML text content.  It does not escape
162      * quotation characters and is thus unsafe for use with
     * HTML attributes.  Use either {@link #forHtml(String)} or {@link #forHtmlAttribute(String)} for those
     * contexts.</p>
165      *
166      * <b>Example JSP Usage</b>
167      * <pre>
168      *     &lt;div&gt;&lt;%=Encode.forHtmlContent(unsafeData)%&gt;&lt;/div&gt;
169      * </pre>
170      * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
171      *   <caption><b>Encoding Table</b></caption>
172      *   <thead>
173      *     <tr>
174      *       <th align="left" class="colFirst">Input</th>
175      *       <th align="left" class="colLast">Result</th>
176      *     </tr>
177      *   </thead>
178      *   <tbody>
179      *     <tr class="altColor">
180      *       <td class="colFirst">{@code &}</td>
181      *       <td class="colLast">{@code &amp;}</td>
182      *     </tr>
183      *     <tr class="rowColor">
184      *       <td class="colFirst">{@code <}</td>
185      *       <td class="colLast">{@code &lt;}</td>
186      *     </tr>
187      *     <tr class="altColor">
188      *       <td class="colFirst">{@code >}</td>
189      *       <td class="colLast">{@code &gt;}</td>
190      *     </tr>
191      *   </tbody>
192      * </table>
193      *
194      * <p><b>Additional Notes</b></p>
195      * <ul>
196      * <li>Single-quote character ({@code '}) and double-quote
197      * character ({@code "}) do not require encoding in HTML
198      * blocks, unlike other HTML contexts.</li>
199      *
200      * <li>The encoding of the greater-than sign ({@code >}) is not
201      * strictly required, but is included for maximum
202      * compatibility.</li>
203      *
204      * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
205      * (U+09) and space (U+20) are valid in quoted attributes and in
206      * block in an unescaped form.</li>
207      *
208      * <li>Surrogate pairs are passed through only if valid.</li>
209      *
210      * <li>Characters that are not <a
211      * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
212      * to the XML specification</a> are replaced by a space character
213      * as they could lead to parsing errors.  In particular only {@code #x9
214      * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
215      * [#x10000-#x10FFFF]} are considered valid.</li>
216      * </ul>
217      *
218      * @param input the input to encode
219      * @return the encoded result
220      */
forHtmlContent(String input)221     public static String forHtmlContent(String input) {
222         return forXmlContent(input);
223     }
224 
225     /**
226      * See {@link #forHtmlContent(String)} for description of encoding.  This
227      * version writes directly to a Writer without an intervening string.
228      *
229      * @param out where to write encoded output
230      * @param input the input string to encode
231      * @throws IOException if thrown by writer
232      */
forHtmlContent(Writer out, String input)233     public static void forHtmlContent(Writer out, String input)
234         throws IOException
235     {
236         forXmlContent(out, input);
237     }
238 
239     /**
240      * <p>This method encodes for HTML text attributes. Do not use for JavaScript event attributes or for attributes
241      * that are interpreted as a URL. Instead use {@link #forJavaScript(String)} and {@link #forUriComponent(String)}
242      * respectively for those.</p>
243      *
244      * <b>Example JSP Usage</b>
245      * <pre>
246      *     &lt;input value=&quot;&lt;%=Encode.forHtmlAttribute(unsafeData)%&gt;&quot; title=&#39;&lt;%=Encode.forHtmlAttribute(moreUnsafeData)%&gt;&#39; /&gt;
247      * </pre>
248      *
249      * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
250      *   <caption><b>Encoding Table</b></caption>
251      *   <thead>
252      *     <tr>
253      *       <th align="left" class="colFirst">Input</th>
254      *       <th align="left" class="colLast">Result</th>
255      *     </tr>
256      *   </thead>
257      *   <tbody>
258      *     <tr class="altColor">
259      *       <td class="colFirst">{@code &}</td>
260      *       <td class="colLast">{@code &amp;}</td>
261      *     </tr>
262      *     <tr class="rowColor">
263      *       <td class="colFirst">{@code <}</td>
264      *       <td class="colLast">{@code &lt;}</td>
265      *     </tr>
266      *     <tr class="altColor">
267      *       <td class="colFirst">{@code "}</td>
268      *       <td class="colLast">{@code &#34;}</td>
269      *     </tr>
270      *     <tr class="rowColor">
271      *       <td class="colFirst">{@code '}</td>
272      *       <td class="colLast">{@code &#39;}</td>
273      *     </tr>
274      *   </tbody>
275      * </table>
276      *
277      * <p><b>Additional Notes</b></p>
278      * <ul>
279      * <li>When using this method, the caller must provide quotes around the attribute value.</li>
280      *
281      * <li>Both the single-quote character ({@code '}) and the
282      * double-quote character ({@code "}) are encoded so this is safe
283      * for HTML attributes with either enclosing character.</li>
284      *
285      * <li>The encoding of the greater-than sign ({@code >}) is not
286      * required for attributes.</li>
287      *
288      * <li>Numeric encoding is used for double-quote character ({@code
     * "}) as it is shorter than the also valid {@code &quot;}.</li>
290      *
291      * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
292      * (U+09) and space (U+20) are valid in quoted attributes and in
293      * block in an unescaped form.</li>
294      *
295      * <li>Surrogate pairs are passed through only if valid.</li>
296      *
297      * <li>Characters that are not <a
298      * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
299      * to the XML specification</a> are replaced by a space character
300      * as they could lead to parsing errors.  In particular only {@code #x9
301      * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
302      * [#x10000-#x10FFFF]} are considered valid.</li>
303      * </ul>
304      *
305      * @param input the input to encode
306      * @return the encoded result
307      */
forHtmlAttribute(String input)308     public static String forHtmlAttribute(String input) {
309         return forXmlAttribute(input);
310     }
311 
312     /**
313      * See {@link #forHtmlAttribute(String)} for description of encoding.  This
314      * version writes directly to a Writer without an intervening string.
315      *
316      * @param out where to write encoded output
317      * @param input the input string to encode
318      * @throws IOException if thrown by writer
319      */
forHtmlAttribute(Writer out, String input)320     public static void forHtmlAttribute(Writer out, String input)
321         throws IOException
322     {
323         forXmlAttribute(out, input);
324     }
325 
326 
327     /**
328      * <p>Encodes for unquoted HTML attribute values.  {@link
329      * #forHtml(String)} or {@link #forHtmlAttribute(String)} should
330      * usually be preferred over this method as quoted attributes are
331      * XHTML compliant.</p>
332      *
333      * <p>When using this method, the caller is not required to
334      * provide quotes around the attribute (since it is encoded for
335      * such context).  The caller should make sure that the attribute
336      * value does not abut unsafe characters--and thus should usually
337      * err on the side of including a space character after the
338      * value.</p>
339      *
340      * <p>Use of this method is discouraged as quoted attributes are
341      * generally more compatible and safer.  Also note, that no
342      * attempt has been made to optimize this encoding, though it is
343      * still probably faster than other encoding libraries.</p>
344      *
345      * <b>Example JSP Usage</b>
346      * <pre>
347      *     &lt;input value=&lt;%=Encode.forHtmlUnquotedAttribute(input)%&gt; &gt;
348      * </pre>
349      *
350      * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
351      *   <caption><b>Encoding Table</b></caption>
352      *   <thead>
353      *     <tr>
354      *       <th align="left" class="colFirst">Input</th>
355      *       <th align="left" class="colLast">Result</th>
356      *     </tr>
357      *   </thead>
358      *   <tbody>
359      *     <tr class="altColor">
360      *         <td class="colFirst">{@code U+0009} (horizontal tab)</td>
361      *         <td class="colLast">{@code &#9;}</td></tr>
362      *     <tr class="rowColor">
363      *         <td class="colFirst">{@code U+000A} (line feed)</td>
364      *         <td class="colLast">{@code &#10;}</td></tr>
365      *     <tr class="altColor">
366      *         <td class="colFirst">{@code U+000C} (form feed)</td>
367      *         <td class="colLast">{@code &#12;}</td></tr>
368      *     <tr class="rowColor">
369      *         <td class="colFirst">{@code U+000D} (carriage return)</td>
370      *         <td class="colLast">{@code &#13;}</td></tr>
371      *     <tr class="altColor">
372      *         <td class="colFirst">{@code U+0020} (space)</td>
373      *         <td class="colLast">{@code &#32;}</td></tr>
374      *     <tr class="rowColor">
375      *         <td class="colFirst">{@code &}</td>
376      *         <td class="colLast">{@code &amp;}</td></tr>
377      *     <tr class="altColor">
378      *         <td class="colFirst">{@code <}</td>
379      *         <td class="colLast">{@code &lt;}</td></tr>
380      *     <tr class="rowColor">
381      *         <td class="colFirst">{@code >}</td>
382      *         <td class="colLast">{@code &gt;}</td></tr>
383      *     <tr class="altColor">
384      *         <td class="colFirst">{@code "}</td>
385      *         <td class="colLast">{@code &#34;}</td></tr>
386      *     <tr class="rowColor">
387      *         <td class="colFirst">{@code '}</td>
388      *         <td class="colLast">{@code &#39;}</td></tr>
389      *     <tr class="altColor">
390      *         <td class="colFirst">{@code /}</td>
391      *         <td class="colLast">{@code &#47;}</td></tr>
392      *     <tr class="rowColor">
393      *         <td class="colFirst">{@code =}</td>
394      *         <td class="colLast">{@code &#61;}</td></tr>
395      *     <tr class="altColor">
396      *         <td class="colFirst">{@code `}</td>
397      *         <td class="colLast">{@code &#96;}</td></tr>
398      *     <tr class="rowColor">
399      *         <td class="colFirst">{@code U+0085} (next line)</td>
400      *         <td class="colLast">{@code &#133;}</td></tr>
401      *     <tr class="altColor">
402      *         <td class="colFirst">{@code U+2028} (line separator)</td>
403      *         <td class="colLast">{@code &#8232;}</td></tr>
404      *     <tr class="rowColor">
405      *         <td class="colFirst">{@code U+2029} (paragraph separator)</td>
406      *         <td class="colLast">{@code &#8233;}</td></tr>
407      *   </tbody>
408      * </table>
409      *
410      * <p><b>Additional Notes</b></p>
411      * <ul>
412      * <li>The following characters are <i>not</i> encoded:
413      * {@code 0-9, a-z, A-Z}, {@code !}, {@code
414      * #}, {@code $}, {@code %},
415      * {@code (}, {@code )}, {@code
416      * *}, {@code +}, {@code ,},
417      * {@code -}, {@code .}, {@code
418      * [}, {@code \}, {@code ]},
419      * {@code ^}, {@code _}, {@code
420      * }}.</li>
421      *
422      * <li>Surrogate pairs are passed through only if valid.  Invalid
423      * surrogate pairs are replaced by a hyphen (-).</li>
424      *
425      * <li>Characters in the C0 and C1 control blocks and not
426      * otherwise listed above are considered invalid and replaced by a
427      * hyphen (-) character.</li>
428      *
429      * <li>Unicode "non-characters" are replaced by hyphens (-).</li>
430      * </ul>
431      *
432      * @param input the attribute value to be encoded.
433      * @return the attribute value encoded for unquoted attribute
434      * context.
435      */
forHtmlUnquotedAttribute(String input)436     public static String forHtmlUnquotedAttribute(String input) {
437         return encode(Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER, input);
438     }
439 
440     /**
441      * See {@link #forHtmlUnquotedAttribute(String)} for description of encoding.  This
442      * version writes directly to a Writer without an intervening string.
443      *
444      * @param out where to write encoded output
445      * @param input the input string to encode
446      * @throws IOException if thrown by writer
447      */
forHtmlUnquotedAttribute(Writer out, String input)448     public static void forHtmlUnquotedAttribute(Writer out, String input)
449         throws IOException
450     {
451         encode(Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER, out, input);
452     }
453 
454 
455     // HTML comment encoding is not currently supported because
456     // of the number of vendor-specific sequences that would need
    // to be handled (e.g. "<!--[if IE]-->").
458 
459 //    public static String forHtmlComment(String input) {
460 //        // only alphanumeric and space, everything else becomes a space
461 //
462 //        // HTML comment context needs to avoid browser extensions
463 //        // such as "<!--[if IE]-->"
464 //        throw new UnsupportedOperationException();
465 //    }
466 
467     /**
468      * Encodes for CSS strings.  The context must be surrounded by quotation
469      * characters.  It is safe for use in both style blocks and attributes in
470      * HTML.
471      *
472      * <b>Example JSP Usage</b>
473      * <pre>
     *     &lt;div style="background: url('&lt;%=Encode.forCssString(...)%&gt;');"&gt;
475      *
476      *     &lt;style type="text/css"&gt;
477      *         background: url('&lt;%=Encode.forCssString(...)%&gt;');
478      *     &lt;/style&gt;
479      * </pre>
480      *
481      * <b>Encoding  Notes</b>
482      * <ul>
483      *
484      * <li>The following characters are encoded using hexadecimal
485      * encodings: {@code U+0000} - {@code U+001f},
486      * {@code "},
487      * {@code '},
488      * {@code \},
489      * {@code <},
490      * {@code &},
491      * {@code /},
492      * {@code >},
493      * {@code U+007f},
494      * line separator ({@code U+2028}),
495      * paragraph separator ({@code U+2029}).</li>
496      *
497      * <li>Any character requiring encoding is encoded as {@code \xxx}
498      * where {@code xxx} is the shortest hexadecimal representation of
499      * its Unicode code point (after decoding surrogate pairs if
500      * necessary).  This encoding is never zero padded.  Thus, for
501      * example, the tab character is encoded as {@code \9}, not {@code
502      * \0009}.</li>
503      *
504      * <li>The encoder looks ahead 1 character in the input and
505      * appends a space to an encoding to avoid the next character
506      * becoming part of the hexadecimal encoded sequence.  Thus
507      * &ldquo;{@code '1}&rdquo; is encoded as &ldquo;{@code \27
508      * 1}&rdquo;, and not as &ldquo;{@code \271}&rdquo;.  If a space
509      * is not necessary, it is not included, thus &ldquo;{@code
510      * 'x}&rdquo; is encoded as &ldquo;{@code \27x}&rdquo;, and not as
511      * &ldquo;{@code \27 x}&rdquo;.</li>
512      *
513      * <li>Surrogate pairs are passed through only if valid.  Invalid
514      * surrogate pairs are replaced by an underscore (_).</li>
515      *
516      * <li>Unicode "non-characters" are replaced by underscores (_).</li>
517      *
518      * </ul>
519      *
520      * @param input the input to encode
521      * @return the encoded result
522      */
forCssString(String input)523     public static String forCssString(String input) {
524         // need to watch out for CSS expressions
525         return encode(Encoders.CSS_STRING_ENCODER, input);
526     }
527 
528     /**
529      * See {@link #forCssString(String)} for description of encoding.  This
530      * version writes directly to a Writer without an intervening string.
531      *
532      * @param out where to write encoded output
533      * @param input the input string to encode
534      * @throws IOException if thrown by writer
535      */
forCssString(Writer out, String input)536     public static void forCssString(Writer out, String input)
537         throws IOException
538     {
539         encode(Encoders.CSS_STRING_ENCODER, out, input);
540     }
541 
542     /**
543      * Encodes for CSS URL contexts.  The context must be surrounded by {@code "url("}
544      * and {@code ")"}.  It is safe for use in both style blocks and attributes in HTML.
545      * Note: this does not do any checking on the quality or safety of the URL
     * itself.  The caller should ensure that the URL is safe for embedding
547      * (e.g. input validation) by other means.
548      *
549      * <b>Example JSP Usage</b>
550      * <pre>
     *     &lt;div style="background:url(&lt;%=Encode.forCssUrl(...)%&gt;);"&gt;
552      *
553      *     &lt;style type="text/css"&gt;
554      *         background: url('&lt;%=Encode.forCssUrl(...)%&gt;');
555      *     &lt;/style&gt;
556      * </pre>
557      * <b>Encoding  Notes</b>
558      * <ul>
559      *
560      * <li>The following characters are encoded using hexadecimal
561      * encodings: {@code U+0000} - {@code U+001f},
562      * {@code "},
563      * {@code '},
564      * {@code \},
565      * {@code <},
566      * {@code &},
567      * {@code /},
568      * {@code >},
569      * {@code U+007f},
570      * line separator ({@code U+2028}),
571      * paragraph separator ({@code U+2029}).</li>
572      *
573      * <li>Any character requiring encoding is encoded as {@code \xxx}
574      * where {@code xxx} is the shortest hexadecimal representation of
575      * its Unicode code point (after decoding surrogate pairs if
576      * necessary).  This encoding is never zero padded.  Thus, for
577      * example, the tab character is encoded as {@code \9}, not {@code
578      * \0009}.</li>
579      *
580      * <li>The encoder looks ahead 1 character in the input and
581      * appends a space to an encoding to avoid the next character
582      * becoming part of the hexadecimal encoded sequence.  Thus
583      * &ldquo;{@code '1}&rdquo; is encoded as &ldquo;{@code \27
584      * 1}&rdquo;, and not as &ldquo;{@code \271}&rdquo;.  If a space
585      * is not necessary, it is not included, thus &ldquo;{@code
586      * 'x}&rdquo; is encoded as &ldquo;{@code \27x}&rdquo;, and not as
587      * &ldquo;{@code \27 x}&rdquo;.</li>
588      *
589      * <li>Surrogate pairs are passed through only if valid.  Invalid
590      * surrogate pairs are replaced by an underscore (_).</li>
591      *
592      * <li>Unicode "non-characters" are replaced by underscores (_).</li>
593      *
594      * </ul>
595      *
596      * @param input the input to encode
597      * @return the encoded result
598      */
forCssUrl(String input)599     public static String forCssUrl(String input) {
600         return encode(Encoders.CSS_URL_ENCODER, input);
601     }
602 
603     /**
604      * See {@link #forCssUrl(String)} for description of encoding.  This
605      * version writes directly to a Writer without an intervening string.
606      *
607      * @param out where to write encoded output
608      * @param input the input string to encode
609      * @throws IOException if thrown by writer
610      */
forCssUrl(Writer out, String input)611     public static void forCssUrl(Writer out, String input)
612         throws IOException
613     {
614         encode(Encoders.CSS_URL_ENCODER, out, input);
615     }
616 
617     /**
618      * <p>Performs percent-encoding of a URL according to RFC 3986.  The provided
     * URL is assumed to be a valid URL.  This method does not do any checking on
620      * the quality or safety of the URL itself.  In many applications it may
621      * be better to use {@link java.net.URI} instead.  Note: this is a
622      * particularly dangerous context to put untrusted content in, as for
623      * example a "javascript:" URL provided by a malicious user would be
624      * "properly" escaped, and still execute.</p>
625      *
626      * <b>Encoding Table</b>
627      * <p>The following characters are <i>not</i> encoded:</p>
628      * <pre>
629      * U+20:   !   # $   &amp; ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ;   =   ?
630      * U+40: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [   ]   _
631      * U+60:   a b c d e f g h i j k l m n o p q r s t u v w x y z       ~
632      * </pre>
633      *
634      * <b>Encoding Notes</b>
635      * <ul>
636      *
637      *   <li>The single-quote character({@code '}) <b>is not encoded</b>.</li>
638      *
639      *   <li>This encoding is not intended to be used standalone.  The
640      *   output should be encoded to the target context.  For example:
641      *   {@code <a
642      *   href="<%=Encode.forHtmlAttribute(Encode.forUri(uri))%>">...</a>}.
643      *   (Note, the single-quote character ({@code '}) is not
644      *   encoded.)</li>
645      *
646      *   <li>URL encoding is an encoding for bytes, not unicode.  The
647      *   input string is thus first encoded as a sequence of UTF-8
     *   bytes.  The bytes are then encoded as {@code %xx} where {@code
649      *   xx} is the two-digit hexadecimal representation of the
650      *   byte. (The implementation does this as one step for
651      *   performance.)</li>
652      *
653      *   <li>Surrogate pairs are first decoded to a Unicode code point
654      *   before encoding as UTF-8.</li>
655      *
656      *   <li>Invalid characters (e.g. partial or invalid surrogate
657      *   pairs), are replaced with a hyphen ({@code -}) character.</li>
658      *
659      * </ul>
660      *
661      * @param input the input to encode
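     * @return the encoded result
     *
     * @deprecated  There is never a need to encode a complete URI with this form of encoding.
     * Encode individual components with {@link #forUriComponent(String)} instead.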
663      */
forUri(String input)664     @Deprecated public static String forUri(String input) {
665         return encode(Encoders.URI_ENCODER, input);
666     }
667 
668     /**
669      * See {@link #forUri(String)} for description of encoding.  This
670      * version writes directly to a Writer without an intervening string.
671      *
672      * @param out where to write encoded output
673      * @param input the input string to encode
674      * @throws IOException if thrown by writer
675      *
676      * @deprecated  There is never a need to encode a complete URI with this form of encoding.
677      */
forUri(Writer out, String input)678     @Deprecated public static void forUri(Writer out, String input)
679         throws IOException
680     {
681         encode(Encoders.URI_ENCODER, out, input);
682     }
683 
684     /**
685      * Performs percent-encoding for a component of a URI, such as a query
686      * parameter name or value, path or query-string.  In particular this
     * method ensures that special characters in the component do not get
688      * interpreted as part of another component.
689      *
690      * <pre>
691      *     &lt;a href="http://www.owasp.org/&lt;%=Encode.forUriComponent(...)%&gt;?query#fragment"&gt;
692      *
693      *     &lt;a href="/search?value=&lt;%=Encode.forUriComponent(...)%&gt;&amp;order=1#top"&gt;
694      * </pre>
695      *
696      * <b>Encoding Table</b>
697      * <p>The following characters are <i>not</i> encoded:</p>
698      * <pre>
699      * U+20:                           - .   0 1 2 3 4 5 6 7 8 9
700      * U+40:   A B C D E F G H I J K L M N O P Q R S T U V W X Y Z         _
701      * U+60:   a b c d e f g h i j k l m n o p q r s t u v w x y z       ~
702      * </pre>
703      *
704      * <b>Encoding Notes</b>
705      * <ul>
706      *
707      *   <li>Unlike {@link #forUri(String)} this method is safe to be
708      *   used in most containing contexts, including: HTML/XML, CSS,
709      *   and JavaScript contexts.</li>
710      *
711      *   <li>URL encoding is an encoding for bytes, not unicode.  The
712      *   input string is thus first encoded as a sequence of UTF-8
     *   bytes.  The bytes are then encoded as {@code %xx} where {@code
714      *   xx} is the two-digit hexadecimal representation of the
715      *   byte. (The implementation does this as one step for
716      *   performance.)</li>
717      *
718      *   <li>Surrogate pairs are first decoded to a Unicode code point
719      *   before encoding as UTF-8.</li>
720      *
721      *   <li>Invalid characters (e.g. partial or invalid surrogate
722      *   pairs), are replaced with a hyphen ({@code -}) character.</li>
723      *
724      * </ul>
725      *
726      * @param input the input to encode
727      * @return the encoded result
728      */
forUriComponent(String input)729     public static String forUriComponent(String input) {
730         return encode(Encoders.URI_COMPONENT_ENCODER, input);
731     }
732 
733     /**
734      * See {@link #forUriComponent(String)} for description of encoding.  This
735      * version writes directly to a Writer without an intervening string.
736      *
737      * @param out where to write encoded output
738      * @param input the input string to encode
739      * @throws IOException if thrown by writer
740      */
forUriComponent(Writer out, String input)741     public static void forUriComponent(Writer out, String input)
742         throws IOException
743     {
744         encode(Encoders.URI_COMPONENT_ENCODER, out, input);
745     }
746 
747     /**
748      * Encoder for XML and XHTML.  See {@link #forHtml(String)} for a
749      * description of the encoding and context.
750      *
751      * @see #forHtml(String)
752      * @param input the input to encode
753      * @return the encoded result
754      */
forXml(String input)755     public static String forXml(String input) {
756         return encode(Encoders.XML_ENCODER, input);
757     }
758 
759     /**
760      * See {@link #forXml(String)} for description of encoding.  This
761      * version writes directly to a Writer without an intervening string.
762      *
763      * @param out where to write encoded output
764      * @param input the input string to encode
765      * @throws IOException if thrown by writer
766      */
forXml(Writer out, String input)767     public static void forXml(Writer out, String input)
768         throws IOException
769     {
770         encode(Encoders.XML_ENCODER, out, input);
771     }
772 
773     /**
774      * Encoder for XML and XHTML text content.  See {@link
775      * #forHtmlContent(String)} for description of encoding and
776      * context.
777      *
778      * @see #forHtmlContent(String)
779      * @param input the input to encode
780      * @return the encoded result
781      */
forXmlContent(String input)782     public static String forXmlContent(String input) {
783         return encode(Encoders.XML_CONTENT_ENCODER, input);
784     }
785 
786     /**
787      * See {@link #forXmlContent(String)} for description of encoding.  This
788      * version writes directly to a Writer without an intervening string.
789      *
790      * @param out where to write encoded output
791      * @param input the input string to encode
792      * @throws IOException if thrown by writer
793      */
forXmlContent(Writer out, String input)794     public static void forXmlContent(Writer out, String input)
795         throws IOException
796     {
797         encode(Encoders.XML_CONTENT_ENCODER, out, input);
798     }
799 
800     /**
801      * Encoder for XML and XHTML attribute content.  See {@link
802      * #forHtmlAttribute(String)} for description of encoding and
803      * context.
804      *
805      * @see #forHtmlAttribute(String)
806      * @param input the input to encode
807      * @return the encoded result
808      */
forXmlAttribute(String input)809     public static String forXmlAttribute(String input) {
810         return encode(Encoders.XML_ATTRIBUTE_ENCODER, input);
811     }
812 
813     /**
814      * See {@link #forXmlAttribute(String)} for description of encoding.  This
815      * version writes directly to a Writer without an intervening string.
816      *
817      * @param out where to write encoded output
818      * @param input the input string to encode
819      * @throws IOException if thrown by writer
820      */
forXmlAttribute(Writer out, String input)821     public static void forXmlAttribute(Writer out, String input)
822         throws IOException
823     {
824         encode(Encoders.XML_ATTRIBUTE_ENCODER, out, input);
825     }
826 
827     /**
828      * Encoder for XML comments.  <strong>NOT FOR USE WITH
829      * (X)HTML CONTEXTS.</strong>  (X)HTML comments may be interpreted by
830      * browsers as something other than a comment, typically in vendor
831      * specific extensions (e.g. {@code <--if[IE]-->}).
832      * For (X)HTML it is recommend that unsafe content never be included
833      * in a comment.
834      *
835      * <p>The caller must provide the comment start and end sequences.</p>
836      *
837      * <p>This method replaces all invalid XML characters with spaces,
838      * and replaces the "--" sequence (which is invalid in XML comments)
839      * with "-~" (hyphen-tilde).  <b>This encoding behavior may change
840      * in future releases.</b>  If the comments need to be decoded, the
841      * caller will need to come up with their own encode/decode system.</p>
842      *
843      * <pre>
844      *     out.println("&lt;?xml version='1.0'?&gt;");
845      *     out.println("&lt;data&gt;");
846      *     out.println("&lt;!-- "+Encode.forXmlComment(comment)+" --&gt;");
847      *     out.println("&lt;/data&gt;");
848      * </pre>
849      *
850      * @param input the input to encode
851      * @return the encoded result
852      */
forXmlComment(String input)853     public static String forXmlComment(String input) {
854         return encode(Encoders.XML_COMMENT_ENCODER, input);
855     }
856 
857     /**
858      * See {@link #forXmlComment(String)} for description of encoding.  This
859      * version writes directly to a Writer without an intervening string.
860      *
861      * @param out where to write encoded output
862      * @param input the input string to encode
863      * @throws IOException if thrown by writer
864      */
forXmlComment(Writer out, String input)865     public static void forXmlComment(Writer out, String input)
866         throws IOException
867     {
868         encode(Encoders.XML_COMMENT_ENCODER, out, input);
869     }
870 
871     /**
872      * Encodes data for an XML CDATA section.  On the chance that the input
873      * contains a terminating {@code "]]>"}, it will be replaced by
874      * {@code "]]>]]<![CDATA[>"}.
875      * As with all XML contexts, characters that are invalid according to the
876      * XML specification will be replaced by a space character.   Caller must
877      * provide the CDATA section boundaries.
878      *
879      * <pre>
880      *     &lt;xml-data&gt;&lt;![CDATA[&lt;%=Encode.forCDATA(...)%&gt;]]&gt;&lt;/xml-data&gt;
881      * </pre>
882      *
883      * @param input the input to encode
884      * @return the encoded result
885      */
forCDATA(String input)886     public static String forCDATA(String input) {
887         return encode(Encoders.CDATA_ENCODER, input);
888     }
889 
890     /**
891      * See {@link #forCDATA(String)} for description of encoding.  This
892      * version writes directly to a Writer without an intervening string.
893      *
894      * @param out where to write encoded output
895      * @param input the input string to encode
896      * @throws IOException if thrown by writer
897      */
forCDATA(Writer out, String input)898     public static void forCDATA(Writer out, String input)
899         throws IOException
900     {
901         encode(Encoders.CDATA_ENCODER, out, input);
902     }
903 
904     /**
905      * Encodes for a Java string.  This method will use "\b", "\t", "\r", "\f",
906      * "\n", "\"", "\'", "\\", octal and unicode escapes.  Valid surrogate
907      * pairing is not checked.   The caller must provide the enclosing quotation
908      * characters.  This method is useful for when writing code generators and
909      * outputting debug messages.
910      *
911      * <pre>
912      *     out.println("public class Hello {");
913      *     out.println("    public static void main(String[] args) {");
914      *     out.println("        System.out.println(\"" + Encode.forJava(message) + "\");");
915      *     out.println("    }");
916      *     out.println("}");
917      * </pre>
918      *
919      * @param input the input to encode
920      * @return the input encoded for java strings.
921      */
forJava(String input)922     public static String forJava(String input) {
923         return encode(Encoders.JAVA_ENCODER, input);
924     }
925 
926     /**
927      * See {@link #forJava(String)} for description of encoding.  This
928      * version writes directly to a Writer without an intervening string.
929      *
930      * @param out where to write encoded output
931      * @param input the input string to encode
932      * @throws IOException if thrown by writer
933      */
forJava(Writer out, String input)934     public static void forJava(Writer out, String input)
935         throws IOException
936     {
937         encode(Encoders.JAVA_ENCODER, out, input);
938     }
939 
940     /**
941      * <p>Encodes for a JavaScript string.  It is safe for use in HTML
942      * script attributes (such as {@code onclick}), script
943      * blocks, JSON files, and JavaScript source.  The caller MUST
944      * provide the surrounding quotation characters for the string.
945      * Since this performs additional encoding so it can work in all
946      * of the JavaScript contexts listed, it may be slightly less
947      * efficient than using one of the methods targeted to a specific
948      * JavaScript context ({@link #forJavaScriptAttribute(String)},
949      * {@link #forJavaScriptBlock}, {@link #forJavaScriptSource}).
950      * Unless you are interested in saving a few bytes of output or
951      * are writing a framework on top of this library, it is recommend
952      * that you use this method over the others.</p>
953      *
954      * <b>Example JSP Usage:</b>
955      * <pre>
956      *    &lt;button onclick="alert('&lt;%=Encode.forJavaScript(data)%&gt;');"&gt;
957      *    &lt;script type="text/javascript"&gt;
958      *        var data = "&lt;%=Encode.forJavaScript(data)%&gt;";
959      *    &lt;/script&gt;
960      * </pre>
961      *
962      * <table cellspacing="1" class="memberSummary" cellpadding="1" border="0">
963      *   <caption><b>Encoding Description</b></caption>
964      *   <thead>
965      *     <tr>
966      *       <th align="left" colspan="2" class="colFirst">Input Character</th>
967      *       <th align="left" class="colLast">Encoded Result</th>
968      *       <th align="left" class="colLast">Notes</th>
969      *     </tr>
970      *   </thead>
971      *   <tbody>
972      *     <tr class="altColor">
973      *       <td class="colFirst">U+0008</td><td><i>BS</i></td>
974      *       <td class="colLast"><code>\b</code></td>
975      *       <td class="colLast">Backspace character</td>
976      *     </tr>
977      *     <tr class="rowColor">
978      *       <td class="colFirst">U+0009</td><td><i>HT</i></td>
979      *       <td class="colLast"><code>\t</code></td>
980      *       <td class="colLast">Horizontal tab character</td>
981      *     </tr>
982      *     <tr class="altColor">
983      *       <td class="colFirst">U+000A</td><td><i>LF</i></td>
984      *       <td class="colLast"><code>\n</code></td>
985      *       <td class="colLast">Line feed character</td>
986      *     </tr>
987      *     <tr class="rowColor">
988      *       <td class="colFirst">U+000C</td><td><i>FF</i></td>
989      *       <td class="colLast"><code>\f</code></td>
990      *       <td class="colLast">Form feed character</td>
991      *     </tr>
992      *     <tr class="altColor">
993      *       <td class="colFirst">U+000D</td><td><i>CR</i></td>
994      *       <td class="colLast"><code>\r</code></td>
995      *       <td class="colLast">Carriage return character</td>
996      *     </tr>
997      *     <tr class="rowColor">
998      *       <td class="colFirst">U+0022</td><td><code>"</code></td>
999      *       <td class="colLast"><code>\x22</code></td>
1000      *       <td class="colLast">The encoding <code>\"</code> is not used here because
1001      *       it is not safe for use in HTML attributes.  (In HTML
1002      *       attributes, it would also be correct to use
1003      *       "\&amp;quot;".)</td>
1004      *     </tr>
1005      *     <tr class="altColor">
1006      *       <td class="colFirst">U+0026</td><td><code>&amp;</code></td>
1007      *       <td class="colLast"><code>\x26</code></td>
1008      *       <td class="colLast">Ampersand character</td>
1009      *     </tr>
1010      *     <tr class="rowColor">
1011      *       <td class="colFirst">U+0027</td><td><code>'</code></td>
1012      *       <td class="colLast"><code>\x27</code></td>
1013      *       <td class="colLast">The encoding <code>\'</code> is not used here because
1014      *       it is not safe for use in HTML attributes.  (In HTML
1015      *       attributes, it would also be correct to use
1016      *       "\&amp;#39;".)</td>
1017      *     </tr>
1018      *     <tr class="altColor">
1019      *       <td class="colFirst">U+002F</td><td><code>/</code></td>
1020      *       <td class="colLast"><code>\/</code></td>
1021      *       <td class="colLast">This encoding is used to avoid an input sequence
1022      *       "&lt;/" from prematurely terminating a &lt;/script&gt;
1023      *       block.</td>
1024      *     </tr>
1025      *     <tr class="rowColor">
1026      *       <td class="colFirst">U+005C</td><td><code>\</code></td>
1027      *       <td class="colLast"><code>\\</code></td>
1028      *       <td class="colLast"></td>
1029      *     </tr>
1030      *     <tr class="altColor">
1031      *       <td class="colFirst" colspan="2">U+0000&nbsp;to&nbsp;U+001F</td>
1032      *       <td class="colLast"><code>\x##</code></td>
1033      *       <td class="colLast">Hexadecimal encoding is used for characters in this
1034      *       range that were not already mentioned in above.</td>
1035      *     </tr>
1036      *   </tbody>
1037      * </table>
1038      *
1039      * @param input the input string to encode
1040      * @return the input encoded for JavaScript
1041      * @see #forJavaScriptAttribute(String)
1042      * @see #forJavaScriptBlock(String)
1043      */
forJavaScript(String input)1044     public static String forJavaScript(String input) {
1045         return encode(Encoders.JAVASCRIPT_ENCODER, input);
1046     }
1047 
1048     /**
1049      * See {@link #forJavaScript(String)} for description of encoding.  This
1050      * version writes directly to a Writer without an intervening string.
1051      *
1052      * @param out where to write encoded output
1053      * @param input the input string to encode
1054      * @throws IOException if thrown by writer
1055      */
forJavaScript(Writer out, String input)1056     public static void forJavaScript(Writer out, String input)
1057         throws IOException
1058     {
1059         encode(Encoders.JAVASCRIPT_ENCODER, out, input);
1060     }
1061 
1062     /**
1063      * <p>This method encodes for JavaScript strings contained within
1064      * HTML script attributes (such as {@code onclick}).  It is
1065      * NOT safe for use in script blocks.  The caller MUST provide the
1066      * surrounding quotation characters.  This method performs the
1067      * same encode as {@link #forJavaScript(String)} with the
1068      * exception that <code>/</code> is not escaped.</p>
1069      *
1070      * <p><strong>Unless you are interested in saving a few bytes of
1071      * output or are writing a framework on top of this library, it is
1072      * recommend that you use {@link #forJavaScript(String)} over this
1073      * method.</strong></p>
1074      *
1075      * <b>Example JSP Usage:</b>
1076      * <pre>
1077      *    &lt;button onclick="alert('&lt;%=Encode.forJavaScriptAttribute(data)%&gt;');"&gt;
1078      * </pre>
1079      *
1080      * @param input the input string to encode
1081      * @return the input encoded for JavaScript
1082      * @see #forJavaScript(String)
1083      * @see #forJavaScriptBlock(String)
1084      */
forJavaScriptAttribute(String input)1085     public static String forJavaScriptAttribute(String input) {
1086         return encode(Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER, input);
1087     }
1088 
1089     /**
1090      * See {@link #forJavaScriptAttribute(String)} for description of encoding.  This
1091      * version writes directly to a Writer without an intervening string.
1092      *
1093      * @param out where to write encoded output
1094      * @param input the input string to encode
1095      * @throws IOException if thrown by writer
1096      */
forJavaScriptAttribute(Writer out, String input)1097     public static void forJavaScriptAttribute(Writer out, String input)
1098         throws IOException
1099     {
1100         encode(Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER, out, input);
1101     }
1102 
1103     /**
1104      * <p>This method encodes for JavaScript strings contained within
1105      * HTML script blocks.  It is NOT safe for use in script
1106      * attributes (such as <code>onclick</code>).  The caller must
1107      * provide the surrounding quotation characters.  This method
1108      * performs the same encode as {@link #forJavaScript(String)} with
1109      * the exception that <code>"</code> and <code>'</code> are
1110      * encoded as <code>\"</code> and <code>\'</code>
1111      * respectively.</p>
1112      *
1113      * <p><strong>Unless you are interested in saving a few bytes of
1114      * output or are writing a framework on top of this library, it is
1115      * recommend that you use {@link #forJavaScript(String)} over this
1116      * method.</strong></p>
1117      *
1118      * <b>Example JSP Usage:</b>
1119      * <pre>
1120      *    &lt;script type="text/javascript"&gt;
1121      *        var data = "&lt;%=Encode.forJavaScriptBlock(data)%&gt;";
1122      *    &lt;/script&gt;
1123      * </pre>
1124      *
1125      * @param input the input string to encode
1126      * @return the input encoded for JavaScript
1127      * @see #forJavaScript(String)
1128      * @see #forJavaScriptAttribute(String)
1129      */
forJavaScriptBlock(String input)1130     public static String forJavaScriptBlock(String input) {
1131         return encode(Encoders.JAVASCRIPT_BLOCK_ENCODER, input);
1132     }
1133 
1134     /**
1135      * See {@link #forJavaScriptBlock(String)} for description of encoding.  This
1136      * version writes directly to a Writer without an intervening string.
1137      *
1138      * @param out where to write encoded output
1139      * @param input the input string to encode
1140      * @throws IOException if thrown by writer
1141      */
forJavaScriptBlock(Writer out, String input)1142     public static void forJavaScriptBlock(Writer out, String input)
1143         throws IOException
1144     {
1145         encode(Encoders.JAVASCRIPT_BLOCK_ENCODER, out, input);
1146     }
1147 
1148     /**
1149      * <p>This method encodes for JavaScript strings contained within
1150      * a JavaScript or JSON file.  <strong>This method is NOT safe for
1151      * use in ANY context embedded in HTML.</strong> The caller must
1152      * provide the surrounding quotation characters.  This method
1153      * performs the same encode as {@link #forJavaScript(String)} with
1154      * the exception that <code>/</code> and <code>&amp;</code> are not
1155      * escaped and <code>"</code> and <code>'</code> are encoded as
1156      * <code>\"</code> and <code>\'</code> respectively.</p>
1157      *
1158      * <p><strong>Unless you are interested in saving a few bytes of
1159      * output or are writing a framework on top of this library, it is
1160      * recommend that you use {@link #forJavaScript(String)} over this
1161      * method.</strong></p>
1162      *
1163      * <b>Example JSP Usage:</b>
1164      * This example is serving up JavaScript source directly:
1165      * <pre>
1166      *    &lt;%@page contentType="text/javascript; charset=UTF-8"%&gt;
1167      *    var data = "&lt;%=Encode.forJavaScriptSource(data)%&gt;";
1168      * </pre>
1169      *
1170      * This example is serving up JSON data (users of this use-case
1171      * are encouraged to read up on "JSON Hijacking"):
1172      * <pre>
1173      *    &lt;%@page contentType="application/json; charset=UTF-8"%&gt;
1174      *    &lt;% myapp.jsonHijackingPreventionMeasure(); %&gt;
1175      *    {"data":"&lt;%=Encode.forJavaScriptSource(data)%&gt;"}
1176      * </pre>
1177      *
1178      * @param input the input string to encode
1179      * @return the input encoded for JavaScript
1180      * @see #forJavaScript(String)
1181      * @see #forJavaScriptAttribute(String)
1182      * @see #forJavaScriptBlock(String)
1183      */
forJavaScriptSource(String input)1184     public static String forJavaScriptSource(String input) {
1185         return encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, input);
1186     }
1187 
1188     /**
1189      * See {@link #forJavaScriptSource(String)} for description of encoding.  This
1190      * version writes directly to a Writer without an intervening string.
1191      *
1192      * @param out where to write encoded output
1193      * @param input the input string to encode
1194      * @throws IOException if thrown by writer
1195      */
forJavaScriptSource(Writer out, String input)1196     public static void forJavaScriptSource(Writer out, String input)
1197         throws IOException
1198     {
1199         encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, out, input);
1200     }
1201 
    // Additional?
    // MySQL
    // PostgreSQL
    // Oracle
    // ...
1207 
1208     /**
1209      * Core encoding loop shared by public methods.  It first uses the
1210      * encoder to scan the input for characters that need encoding.  If
1211      * no characters require encoding, the input string is returned.
1212      * Otherwise a buffer is used to encode the remainder
1213      * of the input.
1214      *
1215      * @param encoder the encoder to use
1216      * @param str the string to encode
1217      * @return the input string encoded with the provided encoder.
1218      */
encode(Encoder encoder, String str)1219     static String encode(Encoder encoder, String str) {
1220         if (str == null) {
1221             // consistent with String.valueOf(...) use "null" for null.
1222             str = "null";
1223         }
1224 
1225         // quick pass--see if we need to actually encode anything, if not
1226         // return the value unchanged.
1227         final int n = str.length();
1228         int j = encoder.firstEncodedOffset(str, 0, n);
1229 
1230         if (j == n) {
1231             return str;
1232         }
1233 
1234         // otherwise, we need to encode.  We use a buffer to avoid
1235         // excessive memory allocation for these calls.  Note: this means that
1236         // an encoder implementation must NEVER call this method internally.
1237         return new Buffer().encode(encoder, str, j);
1238     }
1239 
1240     /**
1241      * Core encoding loop shared by public methods.  It first uses the
1242      * encoder to scan the input for characters that need encoding.  If no
1243      * characters require encoding, the input string is written directly to
1244      * the writer.  Otherwise a buffer is used to encode the
1245      * remainder of the input to the buffers.  This version saves a wrapping
1246      * in an String.
1247      *
1248      * @param encoder the encoder to use
1249      * @param out the writer for the encoded output
1250      * @param str the string to encode
1251      * @throws IOException if thrown by the writer
1252      */
encode(Encoder encoder, Writer out, String str)1253     static void encode(Encoder encoder, Writer out, String str)
1254         throws IOException
1255     {
1256         if (str == null) {
1257             // consistent with String.valueOf(...) use "null" for null.
1258             str = "null";
1259         }
1260 
1261         // quick pass--see if we need to actually encode anything, if not
1262         // return the value unchanged.
1263         final int n = str.length();
1264         int j = encoder.firstEncodedOffset(str, 0, n);
1265 
1266         if (j == n) {
1267             out.write(str);
1268             return;
1269         }
1270 
1271         // otherwise, we need to encode.  We use a buffer to avoid
1272         // excessive memory allocation for these calls.  Note: this means that
1273         // an encoder implementation must NEVER call this method internally.
1274         new Buffer().encode(encoder, out, str, j);
1275     }
1276 
1277     /**
1278      * A buffer used for encoding.
1279      */
1280     static class Buffer {
1281         /**
1282          * Input buffer size, used to extract a copy of the input
1283          * from a string and then send to the encoder.
1284          */
1285         static final int INPUT_BUFFER_SIZE = 1024;
1286         /**
1287          * Output buffer size used to store the encoded output before
1288          * wrapping in a string.
1289          */
1290         static final int OUTPUT_BUFFER_SIZE = INPUT_BUFFER_SIZE * 2;
1291 
1292         /**
1293          * The input buffer.  A heap-allocated, array-backed buffer of
1294          * INPUT_BUFFER_SIZE used for holding the characters to encode.
1295          */
1296         final CharBuffer _input = CharBuffer.allocate(INPUT_BUFFER_SIZE);
1297         /**
1298          * The output buffer.  A heap-allocated, array-backed buffer of
1299          * OUTPUT_BUFFER_SIZE used for holding the encoded output.
1300          */
1301         final CharBuffer _output = CharBuffer.allocate(OUTPUT_BUFFER_SIZE);
1302 
1303         /**
1304          * The core String encoding routine of this class.  It uses the input
1305          * and output buffers to allow the encoders to work in reuse arrays.
1306          * When the input and/or output exceeds the capacity of the reused
1307          * arrays, temporary ones are allocated and then discarded after
1308          * the encode is done.
1309          *
1310          * @param encoder the encoder to use
1311          * @param str the string to encode
1312          * @param j the offset in {@code str} to start encoding
1313          * @return the encoded result
1314          */
encode(Encoder encoder, String str, int j)1315         String encode(Encoder encoder, String str, int j) {
1316             final int n = str.length();
1317             final int remaining = n - j;
1318 
1319             if (remaining <= INPUT_BUFFER_SIZE && j <= OUTPUT_BUFFER_SIZE) {
1320                 // the remaining input to encode fits completely in the pre-
1321                 // allocated buffer.
1322                 str.getChars(0, j, _output.array(), 0);
1323                 str.getChars(j, n, _input.array(), 0);
1324 
1325                 _input.limit(remaining).position(0);
1326                 _output.clear().position(j);
1327 
1328                 CoderResult cr = encoder.encodeArrays(_input, _output, true);
1329                 if (cr.isUnderflow()) {
1330                     return new String(_output.array(), 0, _output.position());
1331                 }
1332 
1333                 // else, it's an overflow, we need to use a new output buffer
1334                 // we'll allocate this buffer to be the exact size of the worst
1335                 // case, guaranteeing a second overflow would not be possible.
1336                 CharBuffer tmp = CharBuffer.allocate(_output.position()
1337                             + encoder.maxEncodedLength(_input.remaining()));
1338 
1339                 // copy over everything that has been encoded so far
1340                 tmp.put(_output.array(), 0, _output.position());
1341 
1342                 cr = encoder.encodeArrays(_input, tmp, true);
1343                 if (cr.isOverflow()) {
1344                     throw new AssertionError("unexpected result from encoder");
1345                 }
1346 
1347                 return new String(tmp.array(), 0, tmp.position());
1348             } else {
1349                 // the input it too large for our pre-allocated buffers
1350                 // we'll use a temporary direct heap allocation
1351                 final int m = j + encoder.maxEncodedLength(remaining);
1352                 CharBuffer buffer = CharBuffer.allocate(m);
1353                 str.getChars(0, j, buffer.array(), 0);
1354                 str.getChars(j, n, buffer.array(), m - remaining);
1355 
1356                 CharBuffer input = buffer.duplicate();
1357                 input.limit(m).position(m-remaining);
1358                 buffer.position(j);
1359 
1360                 CoderResult cr = encoder.encodeArrays(input, buffer, true);
1361 
1362                 if (cr.isOverflow()) {
1363                     throw new AssertionError("unexpected result from encoder");
1364                 }
1365 
1366                 return new String(buffer.array(), 0, buffer.position());
1367             }
1368         }
1369 
1370         /**
1371          * The core Writer encoding routing of this class.  It uses the
1372          * input and output buffers to allow the encoders to reuse arrays.
1373          * Unlike the string version, this method will never allocate more
1374          * memory, instead encoding is done in batches and flushed to the
1375          * writer in batches as large as possible.
1376          *
1377          * @param encoder the encoder to use
1378          * @param out where to write the encoded output
1379          * @param str the string to encode
1380          * @param j the position in the string at which the first character
1381          * needs encoding.
1382          * @throws IOException if thrown by the writer.
1383          */
encode(Encoder encoder, Writer out, String str, int j)1384         void encode(Encoder encoder, Writer out, String str, int j)
1385             throws IOException
1386         {
1387             out.write(str, 0, j);
1388 
1389             final int n = str.length();
1390 
1391             _input.clear();
1392             _output.clear();
1393 
1394             final char[] inputArray = _input.array();
1395             final char[] outputArray = _output.array();
1396 
1397             for (;;) {
1398                 final int remainingInput = n - j;
1399                 final int startPosition = _input.position();
1400                 final int batchSize = Math.min(remainingInput, _input.remaining());
1401                 str.getChars(j, j+batchSize, inputArray, startPosition);
1402 
1403                 _input.limit(startPosition + batchSize);
1404 
1405 
1406                 for (;;) {
1407                     CoderResult cr = encoder.encodeArrays(
1408                         _input, _output, batchSize == remainingInput);
1409 
1410                     if (cr.isUnderflow()) {
1411                         // get next input batch
1412                         break;
1413                     }
1414 
1415                     // else, output buffer full, flush and continue.
1416                     out.write(outputArray, 0, _output.position());
1417                     _output.clear();
1418                 }
1419 
1420                 j += _input.position() - startPosition;
1421 
1422                 if (j == n) {
1423                     // done.  flush remaining output buffer and return
1424                     out.write(outputArray, 0, _output.position());
1425                     return;
1426                 }
1427 
1428                 _input.compact();
1429             }
1430         }
1431     }
1432 }
1433