// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
//     * Redistributions of source code must retain the above
//       copyright notice, this list of conditions and the following
//       disclaimer.
//
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials
//       provided with the distribution.
//
//     * Neither the name of the OWASP nor the names of its
//       contributors may be used to endorse or promote products
//       derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 package org.owasp.encoder;
36 
37 import java.io.IOException;
38 import java.io.Writer;
39 import java.nio.CharBuffer;
40 import java.nio.charset.CoderResult;
41 
42 /**
43  * Encode -- fluent interface for contextual encoding.  Example usage in a JSP:
44  *
45  * <pre>
46  *     &lt;input value="&lt;%=Encode.forHtml(value)%&gt;" /&gt;
47  * </pre>
48  *
49  * <p>There are two versions of each contextual encoding method.  The first
50  * takes a {@code String} argument and returns the encoded version as a
51  * {@code String}.  The second version writes the encoded version directly
52  * to a {@code Writer}.</p>
53  *
54  * <p>Please make sure to read and understand the context that the method encodes
55  * for.  Encoding for the incorrect context will likely lead to exposing a
56  * cross-site scripting vulnerability. Those new to XSS mitigation may find it
57  * useful to read the
58  * <a href="https://cheatsheetseries.owasp.org/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.html">
59  * Cross Site Scripting Prevention Cheat Sheet</a> that is part of the OWASP Cheat Sheet series for background
60  * material.
61  * </p>
62  *
63  * @author Jeff Ichnowski
64  */
65 public final class Encode {
66     /** No instances. */
Encode()67     private Encode() {}
68 
69     /**
70      * <p>Encodes for (X)HTML text content and text attributes.  Since
71      * this method encodes for both contexts, it may be slightly less
72      * efficient to use this method over the methods targeted towards
73      * the specific contexts ({@link #forHtmlAttribute(String)} and
74      * {@link #forHtmlContent(String)}).  In general this method should
75      * be preferred unless you are really concerned with saving a few
76      * bytes or are writing a framework that utilizes this
77      * package.</p>
78      *
79      * <b>Example JSP Usage</b>
80      * <pre>
81      *     &lt;div&gt;&lt;%=Encode.forHtml(unsafeData)%&gt;&lt;/div&gt;
82      *
83      *     &lt;input value="&lt;%=Encode.forHtml(unsafeData)%&gt;" /&gt;
84      * </pre>
85      *
86      * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
87      *   <caption><b>Encoding&nbsp;Table</b></caption>
88      *   <thead>
89      *     <tr>
90      *       <th align="left" class="colFirst">Input</th>
91      *       <th align="left" class="colLast">Result</th>
92      *     </tr>
93      *   </thead>
94      *   <tbody>
95      *     <tr class="altColor">
96      *       <td class="colFirst">{@code &}</td>
97      *       <td class="colLast">{@code &amp;}</td>
98      *     </tr>
99      *     <tr class="rowColor">
100      *       <td class="colFirst">{@code <}</td>
101      *       <td class="colLast">{@code &lt;}</td>
102      *     </tr>
103      *     <tr class="altColor">
104      *       <td class="colFirst">{@code >}</td>
105      *       <td class="colLast">{@code &gt;}</td>
106      *     </tr>
107      *     <tr class="rowColor">
108      *       <td class="colFirst">{@code "}</td>
109      *       <td class="colLast">{@code &#34;}</td>
110      *     </tr>
111      *     <tr class="altColor">
112      *       <td class="colFirst">{@code '}</td>
113      *       <td class="colLast">{@code &#39;}</td>
114      *     </tr>
115      *   </tbody>
116      * </table>
117      *
118      * <p><b>Additional Notes</b></p>
119      * <ul>
120      * <li>The encoding of the greater-than sign ({@code >}) is not
121      * strictly required, but is included for maximum
122      * compatibility.</li>
123      *
124      * <li>Numeric encoding is used for double-quote character ({@code
     * "}) as it is shorter than the also valid {@code &quot;}.</li>
126      *
127      * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
128      * (U+09) and space (U+20) are valid in quoted attributes and in
129      * block in an unescaped form.</li>
130      *
131      * <li>Surrogate pairs are passed through only if valid.</li>
132      *
133      * <li>Characters that are not <a
134      * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
135      * to the XML specification</a> are replaced by a space character
136      * as they could lead to parsing errors.  In particular only {@code #x9
137      * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
138      * [#x10000-#x10FFFF]} are considered valid.</li>
139      * </ul>
140      *
141      * @param input the data to encode
142      * @return the data encoded for html.
143      */
forHtml(String input)144     public static String forHtml(String input) {
145         return forXml(input);
146     }
147 
148     /**
149      * See {@link #forHtml(String)} for description of encoding.  This
150      * version writes directly to a Writer without an intervening string.
151      *
152      * @param out where to write encoded output
153      * @param input the input string to encode
154      * @throws IOException if thrown by writer
155      */
forHtml(Writer out, String input)156     public static void forHtml(Writer out, String input) throws IOException {
157         forXml(out, input);
158     }
159 
160     /**
161      * <p>This method encodes for HTML text content.  It does not escape
162      * quotation characters and is thus unsafe for use with
163      * HTML attributes.  Use either {@link #forHtml(String)} or {@link #forHtmlAttribute(String)} for those
     * contexts.</p>
165      *
166      * <b>Example JSP Usage</b>
167      * <pre>
168      *     &lt;div&gt;&lt;%=Encode.forHtmlContent(unsafeData)%&gt;&lt;/div&gt;
169      * </pre>
170      * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
171      *   <caption><b>Encoding Table</b></caption>
172      *   <thead>
173      *     <tr>
174      *       <th align="left" class="colFirst">Input</th>
175      *       <th align="left" class="colLast">Result</th>
176      *     </tr>
177      *   </thead>
178      *   <tbody>
179      *     <tr class="altColor">
180      *       <td class="colFirst">{@code &}</td>
181      *       <td class="colLast">{@code &amp;}</td>
182      *     </tr>
183      *     <tr class="rowColor">
184      *       <td class="colFirst">{@code <}</td>
185      *       <td class="colLast">{@code &lt;}</td>
186      *     </tr>
187      *     <tr class="altColor">
188      *       <td class="colFirst">{@code >}</td>
189      *       <td class="colLast">{@code &gt;}</td>
190      *     </tr>
191      *   </tbody>
192      * </table>
193      *
194      * <p><b>Additional Notes</b></p>
195      * <ul>
196      * <li>Single-quote character ({@code '}) and double-quote
197      * character ({@code "}) do not require encoding in HTML
198      * blocks, unlike other HTML contexts.</li>
199      *
200      * <li>The encoding of the greater-than sign ({@code >}) is not
201      * strictly required, but is included for maximum
202      * compatibility.</li>
203      *
204      * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
205      * (U+09) and space (U+20) are valid in quoted attributes and in
206      * block in an unescaped form.</li>
207      *
208      * <li>Surrogate pairs are passed through only if valid.</li>
209      *
210      * <li>Characters that are not <a
211      * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
212      * to the XML specification</a> are replaced by a space character
213      * as they could lead to parsing errors.  In particular only {@code #x9
214      * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
215      * [#x10000-#x10FFFF]} are considered valid.</li>
216      * </ul>
217      *
218      * @param input the input to encode
219      * @return the encoded result
220      */
forHtmlContent(String input)221     public static String forHtmlContent(String input) {
222         return forXmlContent(input);
223     }
224 
225     /**
226      * See {@link #forHtmlContent(String)} for description of encoding.  This
227      * version writes directly to a Writer without an intervening string.
228      *
229      * @param out where to write encoded output
230      * @param input the input string to encode
231      * @throws IOException if thrown by writer
232      */
forHtmlContent(Writer out, String input)233     public static void forHtmlContent(Writer out, String input)
234         throws IOException
235     {
236         forXmlContent(out, input);
237     }
238 
239     /**
240      * <p>This method encodes for HTML text attributes. Do not use for JavaScript event attributes or for attributes
241      * that are interpreted as a URL. Instead use {@link #forJavaScript(String)} and {@link #forUriComponent(String)}
242      * respectively for those.</p>
243      *
244      * <b>Example JSP Usage</b>
245      * <pre>
     *     &lt;input value="&lt;%=Encode.forHtmlAttribute(unsafeData)%&gt;" /&gt;
247      * </pre>
248      *
249      * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
250      *   <caption><b>Encoding Table</b></caption>
251      *   <thead>
252      *     <tr>
253      *       <th align="left" class="colFirst">Input</th>
254      *       <th align="left" class="colLast">Result</th>
255      *     </tr>
256      *   </thead>
257      *   <tbody>
258      *     <tr class="altColor">
259      *       <td class="colFirst">{@code &}</td>
260      *       <td class="colLast">{@code &amp;}</td>
261      *     </tr>
262      *     <tr class="rowColor">
263      *       <td class="colFirst">{@code <}</td>
264      *       <td class="colLast">{@code &lt;}</td>
265      *     </tr>
266      *     <tr class="altColor">
267      *       <td class="colFirst">{@code "}</td>
268      *       <td class="colLast">{@code &#34;}</td>
269      *     </tr>
270      *     <tr class="rowColor">
271      *       <td class="colFirst">{@code '}</td>
272      *       <td class="colLast">{@code &#39;}</td>
273      *     </tr>
274      *   </tbody>
275      * </table>
276      *
277      * <p><b>Additional Notes</b></p>
278      * <ul>
279      * <li>Both the single-quote character ({@code '}) and the
280      * double-quote character ({@code "}) are encoded so this is safe
281      * for HTML attributes with either enclosing character.</li>
282      *
283      * <li>The encoding of the greater-than sign ({@code >}) is not
284      * required for attributes.</li>
285      *
286      * <li>Numeric encoding is used for double-quote character ({@code
     * "}) as it is shorter than the also valid {@code &quot;}.</li>
288      *
289      * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
290      * (U+09) and space (U+20) are valid in quoted attributes and in
291      * block in an unescaped form.</li>
292      *
293      * <li>Surrogate pairs are passed through only if valid.</li>
294      *
295      * <li>Characters that are not <a
296      * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
297      * to the XML specification</a> are replaced by a space character
298      * as they could lead to parsing errors.  In particular only {@code #x9
299      * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
300      * [#x10000-#x10FFFF]} are considered valid.</li>
301      * </ul>
302      *
303      * @param input the input to encode
304      * @return the encoded result
305      */
forHtmlAttribute(String input)306     public static String forHtmlAttribute(String input) {
307         return forXmlAttribute(input);
308     }
309 
310     /**
311      * See {@link #forHtmlAttribute(String)} for description of encoding.  This
312      * version writes directly to a Writer without an intervening string.
313      *
314      * @param out where to write encoded output
315      * @param input the input string to encode
316      * @throws IOException if thrown by writer
317      */
forHtmlAttribute(Writer out, String input)318     public static void forHtmlAttribute(Writer out, String input)
319         throws IOException
320     {
321         forXmlAttribute(out, input);
322     }
323 
324 
325     /**
326      * <p>Encodes for unquoted HTML attribute values.  {@link
327      * #forHtml(String)} or {@link #forHtmlAttribute(String)} should
328      * usually be preferred over this method as quoted attributes are
329      * XHTML compliant.</p>
330      *
331      * <p>When using this method, the caller is not required to
332      * provide quotes around the attribute (since it is encoded for
333      * such context).  The caller should make sure that the attribute
334      * value does not abut unsafe characters--and thus should usually
335      * err on the side of including a space character after the
336      * value.</p>
337      *
338      * <p>Use of this method is discouraged as quoted attributes are
339      * generally more compatible and safer.  Also note, that no
340      * attempt has been made to optimize this encoding, though it is
341      * still probably faster than other encoding libraries.</p>
342      *
343      * <b>Example JSP Usage</b>
344      * <pre>
345      *     &lt;input value=&lt;%=Encode.forHtmlUnquotedAttribute(input)%&gt; &gt;
346      * </pre>
347      *
348      * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
349      *   <caption><b>Encoding Table</b></caption>
350      *   <thead>
351      *     <tr>
352      *       <th align="left" class="colFirst">Input</th>
353      *       <th align="left" class="colLast">Result</th>
354      *     </tr>
355      *   </thead>
356      *   <tbody>
357      *     <tr class="altColor">
358      *         <td class="colFirst">{@code U+0009} (horizontal tab)</td>
359      *         <td class="colLast">{@code &#9;}</td></tr>
360      *     <tr class="rowColor">
361      *         <td class="colFirst">{@code U+000A} (line feed)</td>
362      *         <td class="colLast">{@code &#10;}</td></tr>
363      *     <tr class="altColor">
364      *         <td class="colFirst">{@code U+000C} (form feed)</td>
365      *         <td class="colLast">{@code &#12;}</td></tr>
366      *     <tr class="rowColor">
367      *         <td class="colFirst">{@code U+000D} (carriage return)</td>
368      *         <td class="colLast">{@code &#13;}</td></tr>
369      *     <tr class="altColor">
370      *         <td class="colFirst">{@code U+0020} (space)</td>
371      *         <td class="colLast">{@code &#32;}</td></tr>
372      *     <tr class="rowColor">
373      *         <td class="colFirst">{@code &}</td>
374      *         <td class="colLast">{@code &amp;}</td></tr>
375      *     <tr class="altColor">
376      *         <td class="colFirst">{@code <}</td>
377      *         <td class="colLast">{@code &lt;}</td></tr>
378      *     <tr class="rowColor">
379      *         <td class="colFirst">{@code >}</td>
380      *         <td class="colLast">{@code &gt;}</td></tr>
381      *     <tr class="altColor">
382      *         <td class="colFirst">{@code "}</td>
383      *         <td class="colLast">{@code &#34;}</td></tr>
384      *     <tr class="rowColor">
385      *         <td class="colFirst">{@code '}</td>
386      *         <td class="colLast">{@code &#39;}</td></tr>
387      *     <tr class="altColor">
388      *         <td class="colFirst">{@code /}</td>
389      *         <td class="colLast">{@code &#47;}</td></tr>
390      *     <tr class="rowColor">
391      *         <td class="colFirst">{@code =}</td>
392      *         <td class="colLast">{@code &#61;}</td></tr>
393      *     <tr class="altColor">
394      *         <td class="colFirst">{@code `}</td>
395      *         <td class="colLast">{@code &#96;}</td></tr>
396      *     <tr class="rowColor">
397      *         <td class="colFirst">{@code U+0085} (next line)</td>
398      *         <td class="colLast">{@code &#133;}</td></tr>
399      *     <tr class="altColor">
400      *         <td class="colFirst">{@code U+2028} (line separator)</td>
401      *         <td class="colLast">{@code &#8232;}</td></tr>
402      *     <tr class="rowColor">
403      *         <td class="colFirst">{@code U+2029} (paragraph separator)</td>
404      *         <td class="colLast">{@code &#8233;}</td></tr>
405      *   </tbody>
406      * </table>
407      *
408      * <p><b>Additional Notes</b></p>
409      * <ul>
410      * <li>The following characters are <i>not</i> encoded:
411      * {@code 0-9, a-z, A-Z}, {@code !}, {@code
412      * #}, {@code $}, {@code %},
413      * {@code (}, {@code )}, {@code
414      * *}, {@code +}, {@code ,},
415      * {@code -}, {@code .}, {@code
416      * [}, {@code \}, {@code ]},
417      * {@code ^}, {@code _}, {@code
418      * }}.</li>
419      *
420      * <li>Surrogate pairs are passed through only if valid.  Invalid
421      * surrogate pairs are replaced by a hyphen (-).</li>
422      *
423      * <li>Characters in the C0 and C1 control blocks and not
424      * otherwise listed above are considered invalid and replaced by a
425      * hyphen (-) character.</li>
426      *
427      * <li>Unicode "non-characters" are replaced by hyphens (-).</li>
428      * </ul>
429      *
430      * @param input the attribute value to be encoded.
431      * @return the attribute value encoded for unquoted attribute
432      * context.
433      */
forHtmlUnquotedAttribute(String input)434     public static String forHtmlUnquotedAttribute(String input) {
435         return encode(Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER, input);
436     }
437 
438     /**
439      * See {@link #forHtmlUnquotedAttribute(String)} for description of encoding.  This
440      * version writes directly to a Writer without an intervening string.
441      *
442      * @param out where to write encoded output
443      * @param input the input string to encode
444      * @throws IOException if thrown by writer
445      */
forHtmlUnquotedAttribute(Writer out, String input)446     public static void forHtmlUnquotedAttribute(Writer out, String input)
447         throws IOException
448     {
449         encode(Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER, out, input);
450     }
451 
452 
453     // HTML comment encoding is not currently supported because
454     // of the number of vendor-specific sequences that would need
    // to be handled (e.g. "<!--[if IE]-->").
456 
457 //    public static String forHtmlComment(String input) {
458 //        // only alphanumeric and space, everything else becomes a space
459 //
460 //        // HTML comment context needs to avoid browser extensions
461 //        // such as "<!--[if IE]-->"
462 //        throw new UnsupportedOperationException();
463 //    }
464 
465     /**
466      * Encodes for CSS strings.  The context must be surrounded by quotation
467      * characters.  It is safe for use in both style blocks and attributes in
468      * HTML.
469      *
470      * <b>Example JSP Usage</b>
471      * <pre>
     *     &lt;div style="background: url('&lt;%=Encode.forCssString(...)%&gt;');"&gt;
473      *
474      *     &lt;style type="text/css"&gt;
475      *         background: url('&lt;%=Encode.forCssString(...)%&gt;');
476      *     &lt;/style&gt;
477      * </pre>
478      *
479      * <b>Encoding  Notes</b>
480      * <ul>
481      *
482      * <li>The following characters are encoded using hexadecimal
483      * encodings: {@code U+0000} - {@code U+001f},
484      * {@code "},
485      * {@code '},
486      * {@code \},
487      * {@code <},
488      * {@code &},
489      * {@code /},
490      * {@code >},
491      * {@code U+007f},
492      * line separator ({@code U+2028}),
493      * paragraph separator ({@code U+2029}).</li>
494      *
495      * <li>Any character requiring encoding is encoded as {@code \xxx}
496      * where {@code xxx} is the shortest hexadecimal representation of
497      * its Unicode code point (after decoding surrogate pairs if
498      * necessary).  This encoding is never zero padded.  Thus, for
499      * example, the tab character is encoded as {@code \9}, not {@code
500      * \0009}.</li>
501      *
502      * <li>The encoder looks ahead 1 character in the input and
503      * appends a space to an encoding to avoid the next character
504      * becoming part of the hexadecimal encoded sequence.  Thus
505      * &ldquo;{@code '1}&rdquo; is encoded as &ldquo;{@code \27
506      * 1}&rdquo;, and not as &ldquo;{@code \271}&rdquo;.  If a space
507      * is not necessary, it is not included, thus &ldquo;{@code
508      * 'x}&rdquo; is encoded as &ldquo;{@code \27x}&rdquo;, and not as
509      * &ldquo;{@code \27 x}&rdquo;.</li>
510      *
511      * <li>Surrogate pairs are passed through only if valid.  Invalid
512      * surrogate pairs are replaced by an underscore (_).</li>
513      *
514      * <li>Unicode "non-characters" are replaced by underscores (_).</li>
515      *
516      * </ul>
517      *
518      * @param input the input to encode
519      * @return the encoded result
520      */
forCssString(String input)521     public static String forCssString(String input) {
522         // need to watch out for CSS expressions
523         return encode(Encoders.CSS_STRING_ENCODER, input);
524     }
525 
526     /**
527      * See {@link #forCssString(String)} for description of encoding.  This
528      * version writes directly to a Writer without an intervening string.
529      *
530      * @param out where to write encoded output
531      * @param input the input string to encode
532      * @throws IOException if thrown by writer
533      */
forCssString(Writer out, String input)534     public static void forCssString(Writer out, String input)
535         throws IOException
536     {
537         encode(Encoders.CSS_STRING_ENCODER, out, input);
538     }
539 
540     /**
541      * Encodes for CSS URL contexts.  The context must be surrounded by {@code "url("}
542      * and {@code ")"}.  It is safe for use in both style blocks and attributes in HTML.
543      * Note: this does not do any checking on the quality or safety of the URL
     * itself.  The caller should ensure that the URL is safe for embedding
545      * (e.g. input validation) by other means.
546      *
547      * <b>Example JSP Usage</b>
548      * <pre>
     *     &lt;div style="background:url(&lt;%=Encode.forCssUrl(...)%&gt;);"&gt;
550      *
551      *     &lt;style type="text/css"&gt;
552      *         background: url('&lt;%=Encode.forCssUrl(...)%&gt;');
553      *     &lt;/style&gt;
554      * </pre>
555      * <b>Encoding  Notes</b>
556      * <ul>
557      *
558      * <li>The following characters are encoded using hexadecimal
559      * encodings: {@code U+0000} - {@code U+001f},
560      * {@code "},
561      * {@code '},
562      * {@code \},
563      * {@code <},
564      * {@code &},
565      * {@code /},
566      * {@code >},
567      * {@code U+007f},
568      * line separator ({@code U+2028}),
569      * paragraph separator ({@code U+2029}).</li>
570      *
571      * <li>Any character requiring encoding is encoded as {@code \xxx}
572      * where {@code xxx} is the shortest hexadecimal representation of
573      * its Unicode code point (after decoding surrogate pairs if
574      * necessary).  This encoding is never zero padded.  Thus, for
575      * example, the tab character is encoded as {@code \9}, not {@code
576      * \0009}.</li>
577      *
578      * <li>The encoder looks ahead 1 character in the input and
579      * appends a space to an encoding to avoid the next character
580      * becoming part of the hexadecimal encoded sequence.  Thus
581      * &ldquo;{@code '1}&rdquo; is encoded as &ldquo;{@code \27
582      * 1}&rdquo;, and not as &ldquo;{@code \271}&rdquo;.  If a space
583      * is not necessary, it is not included, thus &ldquo;{@code
584      * 'x}&rdquo; is encoded as &ldquo;{@code \27x}&rdquo;, and not as
585      * &ldquo;{@code \27 x}&rdquo;.</li>
586      *
587      * <li>Surrogate pairs are passed through only if valid.  Invalid
588      * surrogate pairs are replaced by an underscore (_).</li>
589      *
590      * <li>Unicode "non-characters" are replaced by underscores (_).</li>
591      *
592      * </ul>
593      *
594      * @param input the input to encode
595      * @return the encoded result
596      */
forCssUrl(String input)597     public static String forCssUrl(String input) {
598         return encode(Encoders.CSS_URL_ENCODER, input);
599     }
600 
601     /**
602      * See {@link #forCssUrl(String)} for description of encoding.  This
603      * version writes directly to a Writer without an intervening string.
604      *
605      * @param out where to write encoded output
606      * @param input the input string to encode
607      * @throws IOException if thrown by writer
608      */
forCssUrl(Writer out, String input)609     public static void forCssUrl(Writer out, String input)
610         throws IOException
611     {
612         encode(Encoders.CSS_URL_ENCODER, out, input);
613     }
614 
615     /**
616      * <p>Performs percent-encoding of a URL according to RFC 3986.  The provided
617      * URL is assumed to a valid URL.  This method does not do any checking on
618      * the quality or safety of the URL itself.  In many applications it may
619      * be better to use {@link java.net.URI} instead.  Note: this is a
620      * particularly dangerous context to put untrusted content in, as for
621      * example a "javascript:" URL provided by a malicious user would be
622      * "properly" escaped, and still execute.</p>
623      *
624      * <b>Encoding Table</b>
625      * <p>The following characters are <i>not</i> encoded:</p>
626      * <pre>
627      * U+20:   !   # $   &amp; ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ;   =   ?
628      * U+40: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [   ]   _
629      * U+60:   a b c d e f g h i j k l m n o p q r s t u v w x y z       ~
630      * </pre>
631      *
632      * <b>Encoding Notes</b>
633      * <ul>
634      *
635      *   <li>The single-quote character({@code '}) <b>is not encoded</b>.</li>
636      *
637      *   <li>This encoding is not intended to be used standalone.  The
638      *   output should be encoded to the target context.  For example:
639      *   {@code <a
640      *   href="<%=Encode.forHtmlAttribute(Encode.forUri(uri))%>">...</a>}.
641      *   (Note, the single-quote character ({@code '}) is not
642      *   encoded.)</li>
643      *
644      *   <li>URL encoding is an encoding for bytes, not unicode.  The
645      *   input string is thus first encoded as a sequence of UTF-8
646      *   byte.  The bytes are then encoded as {@code %xx} where {@code
647      *   xx} is the two-digit hexadecimal representation of the
648      *   byte. (The implementation does this as one step for
649      *   performance.)</li>
650      *
651      *   <li>Surrogate pairs are first decoded to a Unicode code point
652      *   before encoding as UTF-8.</li>
653      *
654      *   <li>Invalid characters (e.g. partial or invalid surrogate
655      *   pairs), are replaced with a hyphen ({@code -}) character.</li>
656      *
657      * </ul>
658      *
659      * @param input the input to encode
660      * @return the encoded result
661      */
forUri(String input)662     @Deprecated public static String forUri(String input) {
663         return encode(Encoders.URI_ENCODER, input);
664     }
665 
666     /**
667      * See {@link #forUri(String)} for description of encoding.  This
668      * version writes directly to a Writer without an intervening string.
669      *
670      * @param out where to write encoded output
671      * @param input the input string to encode
672      * @throws IOException if thrown by writer
673      *
674      * @deprecated  There is never a need to encode a complete URI with this form of encoding.
675      */
forUri(Writer out, String input)676     @Deprecated public static void forUri(Writer out, String input)
677         throws IOException
678     {
679         encode(Encoders.URI_ENCODER, out, input);
680     }
681 
682     /**
683      * Performs percent-encoding for a component of a URI, such as a query
684      * parameter name or value, path or query-string.  In particular this
     * method ensures that special characters in the component do not get
686      * interpreted as part of another component.
687      *
688      * <pre>
689      *     &lt;a href="http://www.owasp.org/&lt;%=Encode.forUriComponent(...)%&gt;?query#fragment"&gt;
690      *
691      *     &lt;a href="/search?value=&lt;%=Encode.forUriComponent(...)%&gt;&amp;order=1#top"&gt;
692      * </pre>
693      *
694      * <b>Encoding Table</b>
695      * <p>The following characters are <i>not</i> encoded:</p>
696      * <pre>
697      * U+20:                           - .   0 1 2 3 4 5 6 7 8 9
698      * U+40:   A B C D E F G H I J K L M N O P Q R S T U V W X Y Z         _
699      * U+60:   a b c d e f g h i j k l m n o p q r s t u v w x y z       ~
700      * </pre>
701      *
702      * <b>Encoding Notes</b>
703      * <ul>
704      *
705      *   <li>Unlike {@link #forUri(String)} this method is safe to be
706      *   used in most containing contexts, including: HTML/XML, CSS,
707      *   and JavaScript contexts.</li>
708      *
709      *   <li>URL encoding is an encoding for bytes, not unicode.  The
710      *   input string is thus first encoded as a sequence of UTF-8
     *   bytes.  The bytes are then encoded as {@code %xx} where {@code
712      *   xx} is the two-digit hexadecimal representation of the
713      *   byte. (The implementation does this as one step for
714      *   performance.)</li>
715      *
716      *   <li>Surrogate pairs are first decoded to a Unicode code point
717      *   before encoding as UTF-8.</li>
718      *
719      *   <li>Invalid characters (e.g. partial or invalid surrogate
720      *   pairs), are replaced with a hyphen ({@code -}) character.</li>
721      *
722      * </ul>
723      *
724      * @param input the input to encode
725      * @return the encoded result
726      */
forUriComponent(String input)727     public static String forUriComponent(String input) {
728         return encode(Encoders.URI_COMPONENT_ENCODER, input);
729     }
730 
731     /**
732      * See {@link #forUriComponent(String)} for description of encoding.  This
733      * version writes directly to a Writer without an intervening string.
734      *
735      * @param out where to write encoded output
736      * @param input the input string to encode
737      * @throws IOException if thrown by writer
738      */
forUriComponent(Writer out, String input)739     public static void forUriComponent(Writer out, String input)
740         throws IOException
741     {
742         encode(Encoders.URI_COMPONENT_ENCODER, out, input);
743     }
744 
745     /**
746      * Encoder for XML and XHTML.  See {@link #forHtml(String)} for a
747      * description of the encoding and context.
748      *
749      * @see #forHtml(String)
750      * @param input the input to encode
751      * @return the encoded result
752      */
forXml(String input)753     public static String forXml(String input) {
754         return encode(Encoders.XML_ENCODER, input);
755     }
756 
757     /**
758      * See {@link #forXml(String)} for description of encoding.  This
759      * version writes directly to a Writer without an intervening string.
760      *
761      * @param out where to write encoded output
762      * @param input the input string to encode
763      * @throws IOException if thrown by writer
764      */
forXml(Writer out, String input)765     public static void forXml(Writer out, String input)
766         throws IOException
767     {
768         encode(Encoders.XML_ENCODER, out, input);
769     }
770 
771     /**
772      * Encoder for XML and XHTML text content.  See {@link
773      * #forHtmlContent(String)} for description of encoding and
774      * context.
775      *
776      * @see #forHtmlContent(String)
777      * @param input the input to encode
778      * @return the encoded result
779      */
forXmlContent(String input)780     public static String forXmlContent(String input) {
781         return encode(Encoders.XML_CONTENT_ENCODER, input);
782     }
783 
784     /**
785      * See {@link #forXmlContent(String)} for description of encoding.  This
786      * version writes directly to a Writer without an intervening string.
787      *
788      * @param out where to write encoded output
789      * @param input the input string to encode
790      * @throws IOException if thrown by writer
791      */
forXmlContent(Writer out, String input)792     public static void forXmlContent(Writer out, String input)
793         throws IOException
794     {
795         encode(Encoders.XML_CONTENT_ENCODER, out, input);
796     }
797 
798     /**
799      * Encoder for XML and XHTML attribute content.  See {@link
800      * #forHtmlAttribute(String)} for description of encoding and
801      * context.
802      *
803      * @see #forHtmlAttribute(String)
804      * @param input the input to encode
805      * @return the encoded result
806      */
forXmlAttribute(String input)807     public static String forXmlAttribute(String input) {
808         return encode(Encoders.XML_ATTRIBUTE_ENCODER, input);
809     }
810 
811     /**
812      * See {@link #forXmlAttribute(String)} for description of encoding.  This
813      * version writes directly to a Writer without an intervening string.
814      *
815      * @param out where to write encoded output
816      * @param input the input string to encode
817      * @throws IOException if thrown by writer
818      */
forXmlAttribute(Writer out, String input)819     public static void forXmlAttribute(Writer out, String input)
820         throws IOException
821     {
822         encode(Encoders.XML_ATTRIBUTE_ENCODER, out, input);
823     }
824 
825     /**
826      * Encoder for XML comments.  <strong>NOT FOR USE WITH
827      * (X)HTML CONTEXTS.</strong>  (X)HTML comments may be interpreted by
     * browsers as something other than a comment, typically in
     * vendor-specific extensions (e.g. {@code <!--[if IE]>}).
     * For (X)HTML it is recommended that unsafe content never be included
     * in a comment.
832      *
833      * <p>The caller must provide the comment start and end sequences.</p>
834      *
835      * <p>This method replaces all invalid XML characters with spaces,
836      * and replaces the "--" sequence (which is invalid in XML comments)
837      * with "-~" (hyphen-tilde).  <b>This encoding behavior may change
838      * in future releases.</b>  If the comments need to be decoded, the
839      * caller will need to come up with their own encode/decode system.</p>
840      *
841      * <pre>
842      *     out.println("&lt;?xml version='1.0'?&gt;");
843      *     out.println("&lt;data&gt;");
844      *     out.println("&lt;!-- "+Encode.forXmlComment(comment)+" --&gt;");
845      *     out.println("&lt;/data&gt;");
846      * </pre>
847      *
848      * @param input the input to encode
849      * @return the encoded result
850      */
forXmlComment(String input)851     public static String forXmlComment(String input) {
852         return encode(Encoders.XML_COMMENT_ENCODER, input);
853     }
854 
855     /**
856      * See {@link #forXmlComment(String)} for description of encoding.  This
857      * version writes directly to a Writer without an intervening string.
858      *
859      * @param out where to write encoded output
860      * @param input the input string to encode
861      * @throws IOException if thrown by writer
862      */
forXmlComment(Writer out, String input)863     public static void forXmlComment(Writer out, String input)
864         throws IOException
865     {
866         encode(Encoders.XML_COMMENT_ENCODER, out, input);
867     }
868 
869     /**
870      * Encodes data for an XML CDATA section.  On the chance that the input
871      * contains a terminating {@code "]]>"}, it will be replaced by
872      * {@code "]]>]]<![CDATA[>"}.
873      * As with all XML contexts, characters that are invalid according to the
874      * XML specification will be replaced by a space character.   Caller must
875      * provide the CDATA section boundaries.
876      *
877      * <pre>
878      *     &lt;xml-data&gt;&lt;![CDATA[&lt;%=Encode.forCDATA(...)%&gt;]]&gt;&lt;/xml-data&gt;
879      * </pre>
880      *
881      * @param input the input to encode
882      * @return the encoded result
883      */
forCDATA(String input)884     public static String forCDATA(String input) {
885         return encode(Encoders.CDATA_ENCODER, input);
886     }
887 
888     /**
889      * See {@link #forCDATA(String)} for description of encoding.  This
890      * version writes directly to a Writer without an intervening string.
891      *
892      * @param out where to write encoded output
893      * @param input the input string to encode
894      * @throws IOException if thrown by writer
895      */
forCDATA(Writer out, String input)896     public static void forCDATA(Writer out, String input)
897         throws IOException
898     {
899         encode(Encoders.CDATA_ENCODER, out, input);
900     }
901 
902     /**
903      * Encodes for a Java string.  This method will use "\b", "\t", "\r", "\f",
904      * "\n", "\"", "\'", "\\", octal and unicode escapes.  Valid surrogate
905      * pairing is not checked.   The caller must provide the enclosing quotation
906      * characters.  This method is useful for when writing code generators and
907      * outputting debug messages.
908      *
909      * <pre>
910      *     out.println("public class Hello {");
911      *     out.println("    public static void main(String[] args) {");
912      *     out.println("        System.out.println(\"" + Encode.forJava(message) + "\");");
913      *     out.println("    }");
914      *     out.println("}");
915      * </pre>
916      *
917      * @param input the input to encode
918      * @return the input encoded for java strings.
919      */
forJava(String input)920     public static String forJava(String input) {
921         return encode(Encoders.JAVA_ENCODER, input);
922     }
923 
924     /**
925      * See {@link #forJava(String)} for description of encoding.  This
926      * version writes directly to a Writer without an intervening string.
927      *
928      * @param out where to write encoded output
929      * @param input the input string to encode
930      * @throws IOException if thrown by writer
931      */
forJava(Writer out, String input)932     public static void forJava(Writer out, String input)
933         throws IOException
934     {
935         encode(Encoders.JAVA_ENCODER, out, input);
936     }
937 
938     /**
939      * <p>Encodes for a JavaScript string.  It is safe for use in HTML
940      * script attributes (such as {@code onclick}), script
941      * blocks, JSON files, and JavaScript source.  The caller MUST
942      * provide the surrounding quotation characters for the string.
943      * Since this performs additional encoding so it can work in all
944      * of the JavaScript contexts listed, it may be slightly less
945      * efficient than using one of the methods targeted to a specific
946      * JavaScript context ({@link #forJavaScriptAttribute(String)},
947      * {@link #forJavaScriptBlock}, {@link #forJavaScriptSource}).
     * Unless you are interested in saving a few bytes of output or
     * are writing a framework on top of this library, it is recommended
     * that you use this method over the others.</p>
951      *
952      * <b>Example JSP Usage:</b>
953      * <pre>
954      *    &lt;button onclick="alert('&lt;%=Encode.forJavaScript(data)%&gt;');"&gt;
955      *    &lt;script type="text/javascript"&gt;
956      *        var data = "&lt;%=Encode.forJavaScript(data)%&gt;";
957      *    &lt;/script&gt;
958      * </pre>
959      *
960      * <table cellspacing="1" class="memberSummary" cellpadding="1" border="0">
961      *   <caption><b>Encoding Description</b></caption>
962      *   <thead>
963      *     <tr>
964      *       <th align="left" colspan="2" class="colFirst">Input Character</th>
965      *       <th align="left" class="colLast">Encoded Result</th>
966      *       <th align="left" class="colLast">Notes</th>
967      *     </tr>
968      *   </thead>
969      *   <tbody>
970      *     <tr class="altColor">
971      *       <td class="colFirst">U+0008</td><td><i>BS</i></td>
972      *       <td class="colLast"><code>\b</code></td>
973      *       <td class="colLast">Backspace character</td>
974      *     </tr>
975      *     <tr class="rowColor">
976      *       <td class="colFirst">U+0009</td><td><i>HT</i></td>
977      *       <td class="colLast"><code>\t</code></td>
978      *       <td class="colLast">Horizontal tab character</td>
979      *     </tr>
980      *     <tr class="altColor">
981      *       <td class="colFirst">U+000A</td><td><i>LF</i></td>
982      *       <td class="colLast"><code>\n</code></td>
983      *       <td class="colLast">Line feed character</td>
984      *     </tr>
985      *     <tr class="rowColor">
986      *       <td class="colFirst">U+000C</td><td><i>FF</i></td>
987      *       <td class="colLast"><code>\f</code></td>
988      *       <td class="colLast">Form feed character</td>
989      *     </tr>
990      *     <tr class="altColor">
991      *       <td class="colFirst">U+000D</td><td><i>CR</i></td>
992      *       <td class="colLast"><code>\r</code></td>
993      *       <td class="colLast">Carriage return character</td>
994      *     </tr>
995      *     <tr class="rowColor">
996      *       <td class="colFirst">U+0022</td><td><code>"</code></td>
997      *       <td class="colLast"><code>\x22</code></td>
998      *       <td class="colLast">The encoding <code>\"</code> is not used here because
999      *       it is not safe for use in HTML attributes.  (In HTML
1000      *       attributes, it would also be correct to use
1001      *       "\&amp;quot;".)</td>
1002      *     </tr>
1003      *     <tr class="altColor">
1004      *       <td class="colFirst">U+0026</td><td><code>&amp;</code></td>
1005      *       <td class="colLast"><code>\x26</code></td>
1006      *       <td class="colLast">Ampersand character</td>
1007      *     </tr>
1008      *     <tr class="rowColor">
1009      *       <td class="colFirst">U+0027</td><td><code>'</code></td>
1010      *       <td class="colLast"><code>\x27</code></td>
1011      *       <td class="colLast">The encoding <code>\'</code> is not used here because
1012      *       it is not safe for use in HTML attributes.  (In HTML
1013      *       attributes, it would also be correct to use
1014      *       "\&amp;#39;".)</td>
1015      *     </tr>
1016      *     <tr class="altColor">
1017      *       <td class="colFirst">U+002F</td><td><code>/</code></td>
1018      *       <td class="colLast"><code>\/</code></td>
1019      *       <td class="colLast">This encoding is used to avoid an input sequence
1020      *       "&lt;/" from prematurely terminating a &lt;/script&gt;
1021      *       block.</td>
1022      *     </tr>
1023      *     <tr class="rowColor">
1024      *       <td class="colFirst">U+005C</td><td><code>\</code></td>
1025      *       <td class="colLast"><code>\\</code></td>
1026      *       <td class="colLast"></td>
1027      *     </tr>
1028      *     <tr class="altColor">
1029      *       <td class="colFirst" colspan="2">U+0000&nbsp;to&nbsp;U+001F</td>
1030      *       <td class="colLast"><code>\x##</code></td>
     *       <td class="colLast">Hexadecimal encoding is used for characters in this
     *       range that were not already mentioned above.</td>
1033      *     </tr>
1034      *   </tbody>
1035      * </table>
1036      *
1037      * @param input the input string to encode
1038      * @return the input encoded for JavaScript
1039      * @see #forJavaScriptAttribute(String)
1040      * @see #forJavaScriptBlock(String)
1041      */
forJavaScript(String input)1042     public static String forJavaScript(String input) {
1043         return encode(Encoders.JAVASCRIPT_ENCODER, input);
1044     }
1045 
1046     /**
1047      * See {@link #forJavaScript(String)} for description of encoding.  This
1048      * version writes directly to a Writer without an intervening string.
1049      *
1050      * @param out where to write encoded output
1051      * @param input the input string to encode
1052      * @throws IOException if thrown by writer
1053      */
forJavaScript(Writer out, String input)1054     public static void forJavaScript(Writer out, String input)
1055         throws IOException
1056     {
1057         encode(Encoders.JAVASCRIPT_ENCODER, out, input);
1058     }
1059 
1060     /**
1061      * <p>This method encodes for JavaScript strings contained within
1062      * HTML script attributes (such as {@code onclick}).  It is
1063      * NOT safe for use in script blocks.  The caller MUST provide the
1064      * surrounding quotation characters.  This method performs the
1065      * same encode as {@link #forJavaScript(String)} with the
1066      * exception that <code>/</code> is not escaped.</p>
1067      *
     * <p><strong>Unless you are interested in saving a few bytes of
     * output or are writing a framework on top of this library, it is
     * recommended that you use {@link #forJavaScript(String)} over this
     * method.</strong></p>
1072      *
1073      * <b>Example JSP Usage:</b>
1074      * <pre>
1075      *    &lt;button onclick="alert('&lt;%=Encode.forJavaScriptAttribute(data)%&gt;');"&gt;
1076      * </pre>
1077      *
1078      * @param input the input string to encode
1079      * @return the input encoded for JavaScript
1080      * @see #forJavaScript(String)
1081      * @see #forJavaScriptBlock(String)
1082      */
forJavaScriptAttribute(String input)1083     public static String forJavaScriptAttribute(String input) {
1084         return encode(Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER, input);
1085     }
1086 
1087     /**
1088      * See {@link #forJavaScriptAttribute(String)} for description of encoding.  This
1089      * version writes directly to a Writer without an intervening string.
1090      *
1091      * @param out where to write encoded output
1092      * @param input the input string to encode
1093      * @throws IOException if thrown by writer
1094      */
forJavaScriptAttribute(Writer out, String input)1095     public static void forJavaScriptAttribute(Writer out, String input)
1096         throws IOException
1097     {
1098         encode(Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER, out, input);
1099     }
1100 
1101     /**
1102      * <p>This method encodes for JavaScript strings contained within
1103      * HTML script blocks.  It is NOT safe for use in script
1104      * attributes (such as <code>onclick</code>).  The caller must
1105      * provide the surrounding quotation characters.  This method
1106      * performs the same encode as {@link #forJavaScript(String)} with
1107      * the exception that <code>"</code> and <code>'</code> are
1108      * encoded as <code>\"</code> and <code>\'</code>
1109      * respectively.</p>
1110      *
     * <p><strong>Unless you are interested in saving a few bytes of
     * output or are writing a framework on top of this library, it is
     * recommended that you use {@link #forJavaScript(String)} over this
     * method.</strong></p>
1115      *
1116      * <b>Example JSP Usage:</b>
1117      * <pre>
1118      *    &lt;script type="text/javascript"&gt;
1119      *        var data = "&lt;%=Encode.forJavaScriptBlock(data)%&gt;";
1120      *    &lt;/script&gt;
1121      * </pre>
1122      *
1123      * @param input the input string to encode
1124      * @return the input encoded for JavaScript
1125      * @see #forJavaScript(String)
1126      * @see #forJavaScriptAttribute(String)
1127      */
forJavaScriptBlock(String input)1128     public static String forJavaScriptBlock(String input) {
1129         return encode(Encoders.JAVASCRIPT_BLOCK_ENCODER, input);
1130     }
1131 
1132     /**
1133      * See {@link #forJavaScriptBlock(String)} for description of encoding.  This
1134      * version writes directly to a Writer without an intervening string.
1135      *
1136      * @param out where to write encoded output
1137      * @param input the input string to encode
1138      * @throws IOException if thrown by writer
1139      */
forJavaScriptBlock(Writer out, String input)1140     public static void forJavaScriptBlock(Writer out, String input)
1141         throws IOException
1142     {
1143         encode(Encoders.JAVASCRIPT_BLOCK_ENCODER, out, input);
1144     }
1145 
1146     /**
1147      * <p>This method encodes for JavaScript strings contained within
1148      * a JavaScript or JSON file.  <strong>This method is NOT safe for
1149      * use in ANY context embedded in HTML.</strong> The caller must
1150      * provide the surrounding quotation characters.  This method
1151      * performs the same encode as {@link #forJavaScript(String)} with
1152      * the exception that <code>/</code> and <code>&amp;</code> are not
1153      * escaped and <code>"</code> and <code>'</code> are encoded as
1154      * <code>\"</code> and <code>\'</code> respectively.</p>
1155      *
     * <p><strong>Unless you are interested in saving a few bytes of
     * output or are writing a framework on top of this library, it is
     * recommended that you use {@link #forJavaScript(String)} over this
     * method.</strong></p>
1160      *
1161      * <b>Example JSP Usage:</b>
1162      * This example is serving up JavaScript source directly:
1163      * <pre>
1164      *    &lt;%@page contentType="text/javascript; charset=UTF-8"%&gt;
1165      *    var data = "&lt;%=Encode.forJavaScriptSource(data)%&gt;";
1166      * </pre>
1167      *
1168      * This example is serving up JSON data (users of this use-case
1169      * are encouraged to read up on "JSON Hijacking"):
1170      * <pre>
1171      *    &lt;%@page contentType="application/json; charset=UTF-8"%&gt;
1172      *    &lt;% myapp.jsonHijackingPreventionMeasure(); %&gt;
1173      *    {"data":"&lt;%=Encode.forJavaScriptSource(data)%&gt;"}
1174      * </pre>
1175      *
1176      * @param input the input string to encode
1177      * @return the input encoded for JavaScript
1178      * @see #forJavaScript(String)
1179      * @see #forJavaScriptAttribute(String)
1180      * @see #forJavaScriptBlock(String)
1181      */
forJavaScriptSource(String input)1182     public static String forJavaScriptSource(String input) {
1183         return encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, input);
1184     }
1185 
1186     /**
1187      * See {@link #forJavaScriptSource(String)} for description of encoding.  This
1188      * version writes directly to a Writer without an intervening string.
1189      *
1190      * @param out where to write encoded output
1191      * @param input the input string to encode
1192      * @throws IOException if thrown by writer
1193      */
forJavaScriptSource(Writer out, String input)1194     public static void forJavaScriptSource(Writer out, String input)
1195         throws IOException
1196     {
1197         encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, out, input);
1198     }
1199 
    // Additional?
    // MySQL
    // PostgreSQL
    // Oracle
    // ...
1205 
1206     /**
1207      * Core encoding loop shared by public methods.  It first uses the
1208      * encoder to scan the input for characters that need encoding.  If
1209      * no characters require encoding, the input string is returned.
1210      * Otherwise a buffer is used to encode the remainder
1211      * of the input.
1212      *
1213      * @param encoder the encoder to use
1214      * @param str the string to encode
1215      * @return the input string encoded with the provided encoder.
1216      */
encode(Encoder encoder, String str)1217     static String encode(Encoder encoder, String str) {
1218         if (str == null) {
1219             // consistent with String.valueOf(...) use "null" for null.
1220             str = "null";
1221         }
1222 
1223         // quick pass--see if we need to actually encode anything, if not
1224         // return the value unchanged.
1225         final int n = str.length();
1226         int j = encoder.firstEncodedOffset(str, 0, n);
1227 
1228         if (j == n) {
1229             return str;
1230         }
1231 
1232         // otherwise, we need to encode.  We use a buffer to avoid
1233         // excessive memory allocation for these calls.  Note: this means that
1234         // an encoder implementation must NEVER call this method internally.
1235         return new Buffer().encode(encoder, str, j);
1236     }
1237 
1238     /**
1239      * Core encoding loop shared by public methods.  It first uses the
1240      * encoder to scan the input for characters that need encoding.  If no
1241      * characters require encoding, the input string is written directly to
     * the writer.  Otherwise a buffer is used to encode the
     * remainder of the input to the writer.  This version avoids wrapping
     * the result in a String.
1245      *
1246      * @param encoder the encoder to use
1247      * @param out the writer for the encoded output
1248      * @param str the string to encode
1249      * @throws IOException if thrown by the writer
1250      */
encode(Encoder encoder, Writer out, String str)1251     static void encode(Encoder encoder, Writer out, String str)
1252         throws IOException
1253     {
1254         if (str == null) {
1255             // consistent with String.valueOf(...) use "null" for null.
1256             str = "null";
1257         }
1258 
1259         // quick pass--see if we need to actually encode anything, if not
1260         // return the value unchanged.
1261         final int n = str.length();
1262         int j = encoder.firstEncodedOffset(str, 0, n);
1263 
1264         if (j == n) {
1265             out.write(str);
1266             return;
1267         }
1268 
1269         // otherwise, we need to encode.  We use a buffer to avoid
1270         // excessive memory allocation for these calls.  Note: this means that
1271         // an encoder implementation must NEVER call this method internally.
1272         new Buffer().encode(encoder, out, str, j);
1273     }
1274 
1275     /**
1276      * A buffer used for encoding.
1277      */
1278     static class Buffer {
1279         /**
1280          * Input buffer size, used to extract a copy of the input
1281          * from a string and then send to the encoder.
1282          */
1283         static final int INPUT_BUFFER_SIZE = 1024;
1284         /**
1285          * Output buffer size used to store the encoded output before
1286          * wrapping in a string.
1287          */
1288         static final int OUTPUT_BUFFER_SIZE = INPUT_BUFFER_SIZE * 2;
1289 
1290         /**
1291          * The input buffer.  A heap-allocated, array-backed buffer of
1292          * INPUT_BUFFER_SIZE used for holding the characters to encode.
1293          */
1294         final CharBuffer _input = CharBuffer.allocate(INPUT_BUFFER_SIZE);
1295         /**
1296          * The output buffer.  A heap-allocated, array-backed buffer of
1297          * OUTPUT_BUFFER_SIZE used for holding the encoded output.
1298          */
1299         final CharBuffer _output = CharBuffer.allocate(OUTPUT_BUFFER_SIZE);
1300 
1301         /**
1302          * The core String encoding routine of this class.  It uses the input
1303          * and output buffers to allow the encoders to work in reuse arrays.
1304          * When the input and/or output exceeds the capacity of the reused
1305          * arrays, temporary ones are allocated and then discarded after
1306          * the encode is done.
1307          *
1308          * @param encoder the encoder to use
1309          * @param str the string to encode
1310          * @param j the offset in {@code str} to start encoding
1311          * @return the encoded result
1312          */
encode(Encoder encoder, String str, int j)1313         String encode(Encoder encoder, String str, int j) {
1314             final int n = str.length();
1315             final int remaining = n - j;
1316 
1317             if (remaining <= INPUT_BUFFER_SIZE && j <= OUTPUT_BUFFER_SIZE) {
1318                 // the remaining input to encode fits completely in the pre-
1319                 // allocated buffer.
1320                 str.getChars(0, j, _output.array(), 0);
1321                 str.getChars(j, n, _input.array(), 0);
1322 
1323                 _input.limit(remaining).position(0);
1324                 _output.clear().position(j);
1325 
1326                 CoderResult cr = encoder.encodeArrays(_input, _output, true);
1327                 if (cr.isUnderflow()) {
1328                     return new String(_output.array(), 0, _output.position());
1329                 }
1330 
1331                 // else, it's an overflow, we need to use a new output buffer
1332                 // we'll allocate this buffer to be the exact size of the worst
1333                 // case, guaranteeing a second overflow would not be possible.
1334                 CharBuffer tmp = CharBuffer.allocate(_output.position()
1335                             + encoder.maxEncodedLength(_input.remaining()));
1336 
1337                 // copy over everything that has been encoded so far
1338                 tmp.put(_output.array(), 0, _output.position());
1339 
1340                 cr = encoder.encodeArrays(_input, tmp, true);
1341                 if (cr.isOverflow()) {
1342                     throw new AssertionError("unexpected result from encoder");
1343                 }
1344 
1345                 return new String(tmp.array(), 0, tmp.position());
1346             } else {
1347                 // the input it too large for our pre-allocated buffers
1348                 // we'll use a temporary direct heap allocation
1349                 final int m = j + encoder.maxEncodedLength(remaining);
1350                 CharBuffer buffer = CharBuffer.allocate(m);
1351                 str.getChars(0, j, buffer.array(), 0);
1352                 str.getChars(j, n, buffer.array(), m - remaining);
1353 
1354                 CharBuffer input = buffer.duplicate();
1355                 input.limit(m).position(m-remaining);
1356                 buffer.position(j);
1357 
1358                 CoderResult cr = encoder.encodeArrays(input, buffer, true);
1359 
1360                 if (cr.isOverflow()) {
1361                     throw new AssertionError("unexpected result from encoder");
1362                 }
1363 
1364                 return new String(buffer.array(), 0, buffer.position());
1365             }
1366         }
1367 
1368         /**
1369          * The core Writer encoding routing of this class.  It uses the
1370          * input and output buffers to allow the encoders to reuse arrays.
1371          * Unlike the string version, this method will never allocate more
1372          * memory, instead encoding is done in batches and flushed to the
1373          * writer in batches as large as possible.
1374          *
1375          * @param encoder the encoder to use
1376          * @param out where to write the encoded output
1377          * @param str the string to encode
1378          * @param j the position in the string at which the first character
1379          * needs encoding.
1380          * @throws IOException if thrown by the writer.
1381          */
encode(Encoder encoder, Writer out, String str, int j)1382         void encode(Encoder encoder, Writer out, String str, int j)
1383             throws IOException
1384         {
1385             out.write(str, 0, j);
1386 
1387             final int n = str.length();
1388 
1389             _input.clear();
1390             _output.clear();
1391 
1392             final char[] inputArray = _input.array();
1393             final char[] outputArray = _output.array();
1394 
1395             for (;;) {
1396                 final int remainingInput = n - j;
1397                 final int startPosition = _input.position();
1398                 final int batchSize = Math.min(remainingInput, _input.remaining());
1399                 str.getChars(j, j+batchSize, inputArray, startPosition);
1400 
1401                 _input.limit(startPosition + batchSize);
1402 
1403 
1404                 for (;;) {
1405                     CoderResult cr = encoder.encodeArrays(
1406                         _input, _output, batchSize == remainingInput);
1407 
1408                     if (cr.isUnderflow()) {
1409                         // get next input batch
1410                         break;
1411                     }
1412 
1413                     // else, output buffer full, flush and continue.
1414                     out.write(outputArray, 0, _output.position());
1415                     _output.clear();
1416                 }
1417 
1418                 j += _input.position() - startPosition;
1419 
1420                 if (j == n) {
1421                     // done.  flush remaining output buffer and return
1422                     out.write(outputArray, 0, _output.position());
1423                     return;
1424                 }
1425 
1426                 _input.compact();
1427             }
1428         }
1429     }
1430 }
1431